diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 0897d7f37c..da5fa39335 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -3,41 +3,43 @@ # Analysis period: 180 days # Minimum commits threshold: 1 -benchmarks/ @bkryu @cyx-6 @nv-yunzheq @kahyunnam @jiahanc -benchmarks/routines/ @bkryu @nv-yunzheq @cyx-6 @nvmbreughe @Anerudhan +benchmarks/ @bkryu @jiahanc @cyx-6 @kahyunnam @yzh119 +benchmarks/routines/ @bkryu @nv-yunzheq @jiahanc @cyx-6 @nvmbreughe ci/ @cyx-6 @yzh119 @nvmbreughe ci/scripts/ @cyx-6 ci/scripts/jenkins/ @cyx-6 -csrc/ @wenscarl @yzh119 @cyx-6 @djmmoss @yongwww -csrc/fused_moe/ @yzh119 @yongwww @djmmoss @wenscarl @cyx-6 -csrc/fused_moe/cutlass_backend/ @yzh119 @yongwww @djmmoss @wenscarl @cyx-6 -csrc/nv_internal/ @wenscarl @djmmoss @yzh119 @cyx-6 @yongwww -csrc/nv_internal/cpp/ @wenscarl @yongwww @djmmoss @joker-eph @ttyio -csrc/nv_internal/include/ @wenscarl -csrc/nv_internal/tensorrt_llm/ @wenscarl @djmmoss @yzh119 @cyx-6 @yongwww -csrc/xqa/ @yzh119 @cyx-6 -docs/ @yzh119 @cyx-6 @wenscarl @nv-yunzheq @aleozlx -flashinfer/ @yzh119 @cyx-6 @wenscarl @nvmbreughe @bkryu +csrc/ @yzh119 @wenscarl @djmmoss @cyx-6 @nv-yunzheq +csrc/fused_moe/ @yzh119 @nv-yunzheq @djmmoss @wenscarl @yongwww +csrc/fused_moe/cutlass_backend/ @yzh119 @nv-yunzheq @djmmoss @wenscarl @yongwww +csrc/nv_internal/ @wenscarl @djmmoss @yzh119 @nv-yunzheq @yongwww +csrc/nv_internal/cpp/ @wenscarl @bkryu @yongwww @djmmoss @joker-eph +csrc/nv_internal/include/ @wenscarl @nv-yunzheq +csrc/nv_internal/tensorrt_llm/ @wenscarl @djmmoss @yzh119 @nv-yunzheq @yongwww +csrc/xqa/ @cyx-6 @yzh119 +docs/ @yzh119 @cyx-6 @bkryu @wenscarl @nv-yunzheq +flashinfer/ @yzh119 @cyx-6 @wenscarl @nvmbreughe @aleozlx flashinfer-cubin/ @yzh119 @cyx-6 flashinfer-cubin/flashinfer_cubin/ @yzh119 flashinfer-jit-cache/ @yzh119 @cyx-6 flashinfer-jit-cache/flashinfer_jit_cache/ @yzh119 flashinfer/comm/ @yzh119 @cyx-6 @nvmbreughe @wenscarl @djmmoss -flashinfer/cudnn/ @Anerudhan @yzh119 @cyx-6 @Anerudhan +flashinfer/cudnn/ @Anerudhan @yzh119 @bkryu @cyx-6 @Anerudhan flashinfer/cute_dsl/ @yzh119 @kaixih @Amir-19 @aleozlx -flashinfer/fused_moe/ @djmmoss @yzh119 @cyx-6 @wenscarl @IwakuraRein -flashinfer/jit/ @yzh119 @cyx-6 @djmmoss @aleozlx @yongwww -flashinfer/jit/attention/ @yzh119 @Anerudhan @joker-eph -flashinfer/jit/gemm/ @yzh119 +flashinfer/dsv3_ops/ @nv-yunzheq @nvmbreughe +flashinfer/fused_moe/ @yzh119 @nv-yunzheq @jiahanc @djmmoss @cyx-6 +flashinfer/gemm/ @nvmbreughe @bkryu +flashinfer/jit/ @yzh119 @cyx-6 @aleozlx @nv-yunzheq @jiahanc +flashinfer/jit/attention/ @yzh119 @cyx-6 @Anerudhan +flashinfer/jit/gemm/ @yzh119 @nv-yunzheq @jiahanc flashinfer/logits_processor/ @cyx-6 @yzh119 flashinfer/profiler/ @cyx-6 -flashinfer/triton/ @cyx-6 @nvmbreughe @yzh119 +flashinfer/triton/ @nvmbreughe @cyx-6 flashinfer/tuning_configs/ @kaixih -include/ @yzh119 @cyx-6 @wenscarl @kahyunnam @joker-eph -include/flashinfer/ @yzh119 @cyx-6 @wenscarl @kahyunnam @joker-eph +include/ @yzh119 @kahyunnam @jiahanc @IwakuraRein @nv-yunzheq +include/flashinfer/ @yzh119 @kahyunnam @jiahanc @IwakuraRein @nv-yunzheq include/flashinfer/attention/ @yzh119 @kahyunnam @joker-eph include/flashinfer/comm/ @yongwww @nvmbreughe @djmmoss @yzh119 @cyx-6 -include/flashinfer/gemm/ @ttyio @yongwww @aleozlx @cyx-6 -include/flashinfer/trtllm/ @joker-eph @aleozlx @yzh119 @cyx-6 @wenscarl +include/flashinfer/gemm/ @ttyio @yongwww @yzh119 @nvmbreughe @aleozlx +include/flashinfer/trtllm/ @jiahanc @joker-eph @aleozlx @yzh119 @IwakuraRein profiler/ @cyx-6 -scripts/ @yzh119 
@nvmbreughe @dierksen @yongwww @bkryu +scripts/ @yzh119 @nvmbreughe @kahyunnam @dierksen @yongwww diff --git a/.github/workflows/nightly-release.yml b/.github/workflows/nightly-release.yml index 4d5acdfe63..7c57d4bd7a 100644 --- a/.github/workflows/nightly-release.yml +++ b/.github/workflows/nightly-release.yml @@ -145,7 +145,7 @@ jobs: - name: Build wheel in container env: DOCKER_IMAGE: ${{ matrix.arch == 'aarch64' && format('pytorch/manylinuxaarch64-builder:cuda{0}', matrix.cuda) || format('pytorch/manylinux2_28-builder:cuda{0}', matrix.cuda) }} - FLASHINFER_CUDA_ARCH_LIST: ${{ matrix.cuda == '12.8' && '7.5 8.0 8.9 9.0a 10.0a 12.0a' || '7.5 8.0 8.9 9.0a 10.0a 10.3a 12.0a' }} + FLASHINFER_CUDA_ARCH_LIST: ${{ matrix.cuda < '13.0' && '7.5 8.0 8.9 9.0a 10.0a 12.0a' || '7.5 8.0 8.9 9.0a 10.0a 10.3a 11.0a 12.0f' }} FLASHINFER_DEV_RELEASE_SUFFIX: ${{ needs.setup.outputs.dev_suffix }} run: | # Extract CUDA major and minor versions diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 7e406ff2ac..b11e72e1f7 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -182,7 +182,7 @@ jobs: - name: Build wheel in container env: DOCKER_IMAGE: ${{ matrix.arch == 'aarch64' && format('pytorch/manylinuxaarch64-builder:cuda{0}', matrix.cuda) || format('pytorch/manylinux2_28-builder:cuda{0}', matrix.cuda) }} - FLASHINFER_CUDA_ARCH_LIST: ${{ matrix.cuda == '12.8' && '7.5 8.0 8.9 9.0a 10.0a 12.0a' || '7.5 8.0 8.9 9.0a 10.0a 10.3a 12.0a' }} + FLASHINFER_CUDA_ARCH_LIST: ${{ matrix.cuda < '13.0' && '7.5 8.0 8.9 9.0a 10.0a 12.0a' || '7.5 8.0 8.9 9.0a 10.0a 10.3a 11.0a 12.0f' }} run: | # Extract CUDA major and minor versions CUDA_MAJOR=$(echo "${{ matrix.cuda }}" | cut -d'.' -f1) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml old mode 100644 new mode 100755 diff --git a/README.md b/README.md index 8f93c97f7a..b620c1481d 100644 --- a/README.md +++ b/README.md @@ -15,12 +15,12 @@ Kernel Library for LLM Serving [![Build Status](https://ci.tlcpack.ai/job/flashinfer-ci/job/main/badge/icon)](https://ci.tlcpack.ai/job/flashinfer-ci/job/main/) [![Documentation](https://github.com/flashinfer-ai/flashinfer/actions/workflows/build-doc.yml/badge.svg)](https://github.com/flashinfer-ai/flashinfer/actions/workflows/build-doc.yml) - FlashInfer is a library and kernel generator for Large Language Models that provides high-performance implementation of LLM GPU kernels such as FlashAttention, SparseAttention, PageAttention, Sampling, and more. FlashInfer focuses on LLM serving and inference, and delivers state-of-the-art performance across diverse scenarios. Check our [v0.2 release blog](https://flashinfer.ai/2024/12/16/flashinfer-v02-release.html) for new features! The core features of FlashInfer include: + 1. **Efficient Sparse/Dense Attention Kernels**: Efficient single/batch attention for sparse(paged)/dense KV-storage on CUDA Cores and Tensor Cores (both FA2 & FA3) templates. The vector-sparse attention can achieve 90% of the bandwidth of dense kernels with same problem size. 2. **Load-Balanced Scheduling**: FlashInfer decouples `plan`/`run` stage of attention computation where we schedule the computation of variable-length inputs in `plan` stage to alleviate load-imbalance issue. 3. 
**Memory Efficiency**: FlashInfer offers [Cascade Attention](https://docs.flashinfer.ai/api/cascade.html#flashinfer.cascade.MultiLevelCascadeAttentionWrapper) for hierarchical KV-Cache, and implements Head-Query fusion for accelerating Grouped-Query Attention, and efficient kernels for low-precision attention and fused-RoPE attention for compressed KV-Cache. @@ -31,6 +31,7 @@ The core features of FlashInfer include: FlashInfer supports PyTorch, TVM and C++ (header-only) APIs, and can be easily integrated into existing projects. ## News + - [Mar 10, 2025] [Blog Post](https://flashinfer.ai/2025/03/10/sampling.html) Sorting-Free GPU Kernels for LLM Sampling, which explains the design of sampling kernels in FlashInfer. - [Mar 1, 2025] Checkout flashinfer's [intra-kernel profiler](https://github.com/flashinfer-ai/flashinfer/tree/main/profiler) for visualizing the timeline of each threadblock in GPU kernels. - [Dec 16, 2024] [Blog Post](https://flashinfer.ai/2024/12/16/flashinfer-v02-release.html) FlashInfer 0.2 - Efficient and Customizable Kernels for LLM Inference Serving @@ -51,11 +52,13 @@ pip install flashinfer-python ``` **Package Options:** + - **flashinfer-python**: Core package that compiles/downloads kernels on first use - **flashinfer-cubin**: Pre-compiled kernel binaries for all supported GPU architectures - **flashinfer-jit-cache**: Pre-built kernel cache for specific CUDA versions **For faster initialization and offline usage**, install the optional packages to have most kernels pre-compiled: + ```bash pip install flashinfer-python flashinfer-cubin # JIT cache package (replace cu129 with your CUDA version: cu128, cu129, or cu130) @@ -75,6 +78,7 @@ python -m pip install -v . ``` **For development**, install in editable mode: + ```bash python -m pip install --no-build-isolation -e . -v ``` @@ -82,6 +86,7 @@ python -m pip install --no-build-isolation -e . -v **Build optional packages:** `flashinfer-cubin`: + ```bash cd flashinfer-cubin python -m build --no-isolation --wheel @@ -89,8 +94,9 @@ python -m pip install dist/*.whl ``` `flashinfer-jit-cache` (customize `FLASHINFER_CUDA_ARCH_LIST` for your target GPUs): + ```bash -export FLASHINFER_CUDA_ARCH_LIST="7.5 8.0 8.9 10.0a 10.3a 12.0a" +export FLASHINFER_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 10.3a 11.0a 12.0f" cd flashinfer-jit-cache python -m build --no-isolation --wheel python -m pip install dist/*.whl @@ -120,6 +126,7 @@ flashinfer show-config ``` This command displays: + - FlashInfer version and installed packages (flashinfer-python, flashinfer-cubin, flashinfer-jit-cache) - PyTorch and CUDA version information - Environment variables and artifact paths @@ -162,17 +169,36 @@ o = flashinfer.single_prefill_with_kv_cache(q, k, v, causal=False) # prefill att Check out [documentation](https://docs.flashinfer.ai/) for usage of batch decode/append/prefill kernels and shared-prefix cascading kernels. +## API Logging + +FlashInfer provides comprehensive API logging for debugging. Enable it using environment variables: + +```bash +# Enable logging (levels: 0=off (default), 1=basic, 3=detailed, 5=statistics) +export FLASHINFER_LOGLEVEL=3 + +# Set log destination (stdout (default), stderr, or file path) +export FLASHINFER_LOGDEST=stdout +``` + +For detailed information about logging levels, configuration, and advanced features, see [Logging](https://docs.flashinfer.ai/logging.html) in our documentation. + ## Custom Attention Variants Starting from FlashInfer v0.2, users can customize their own attention variants with additional parameters. 
For more details, refer to our [JIT examples](https://github.com/flashinfer-ai/flashinfer/blob/main/tests/utils/test_jit_example.py). -## GPU Support +## GPU and CUDA Support FlashInfer currently provides support for NVIDIA SM architectures 75 and higher and beta support for 103, 110, 120, and 121. +**Supported CUDA Versions:** 12.6, 12.8, 13.0, 13.1 + +> **Note:** FlashInfer strives to follow PyTorch's supported CUDA versions plus the latest CUDA release. + ## Adoption We are thrilled to share that FlashInfer is being adopted by many cutting-edge projects, including but not limited to: + - [MLC-LLM](https://github.com/mlc-ai/mlc-llm) - [Punica](https://github.com/punica-ai/punica) - [SGLang](https://github.com/sgl-project/sglang) diff --git a/benchmarks/README.md b/benchmarks/README.md index f41d695cdc..d81e9c3642 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -117,7 +117,7 @@ The output CSV will contain detailed metrics including: | `--verbose`, `-v` | Print additional information (can be used multiple times for more verbosity, e.g. `-vv`) | | `--case_tag` | Optional tag for the test case, useful for annotating or filtering results in the output CSV. | | `--generate_repro_command`| If set, prints a reproducer command for the test case and stores it in the output CSV. | -| `--backends` | Space-separated list of backends to test, e.g. fa2, fa2_tc, fa3, cudnn, cutlass, trtllm, trtllm-gen, trtllm-gen-native, cublas| +| `--backends` | Space-separated list of backends to test, e.g. fa2, fa2_tc, fa3, cudnn, cutlass, trtllm, trtllm-gen, trtllm-native, cublas| ### Attention Flags | Flag | Description | @@ -166,8 +166,7 @@ The output CSV will contain detailed metrics including: | `--topk_group` | Number of groups to consider for top-k routing. Default: 1 | | `--routed_scaling_factor`| Scaling factor for routing. Default: 2.5 | | `--local_expert_offset` | Offset of local experts in global expert space. Default: 0 | -| `--local_num_experts` | Number of experts handled by this device. Default: equals num_experts | -| `--tile_tokens_dim` | Tile dimension for tokens. Default: 8 | +| `--local_num_experts` | Number of experts handled by this device. Default: equals num_experts | | | `--routing_method` | Routing method: `renormalize`, `deepseek_v3`, `llama4`, `renormalize_naive`. Default: `deepseek_v3`. | | `--use_shuffled_weight` | Whether to use shuffled weight layout | | `--weight_layout` | Weight layout: 0=MajorK, 1=MajorMn, 2=BlockMajorK. 
Default: 0 | @@ -213,14 +212,14 @@ Legend: - cutlass: CUTLASS - trtllm: TensorRT-LLM - trtllm-gen: TensorRT-LLM (generic wrapper) -- trtllm-gen-native: TensorRT-LLM (native API) +- trtllm-native: TensorRT-LLM (native API) --> | Routine | 7.5 | 8.0 | 8.6 | 8.9 | 9.0 | 10.0 | 10.3 | 12.0 | |---------|-----|-----|-----|-----|-----|-------|-------|-------| -| **BatchDecodeWithPagedKVCacheWrapper** | fa2 | fa2, fa2_tc, cudnn | fa2, fa2_tc, cudnn | fa2, fa2_tc, cudnn | fa2, fa2_tc, cudnn | fa2, fa2_tc, cudnn, trtllm-gen, trtllm-gen-native | fa2, fa2_tc, cudnn, trtllm-gen, trtllm-gen-native | fa2, fa2_tc, cudnn | -| **BatchPrefillWithPagedKVCacheWrapper** | | fa2, cudnn | fa2, cudnn | fa2, cudnn | fa2, fa3, cudnn | fa2, cudnn, trtllm-gen, trtllm-gen-native | fa2, cudnn, trtllm-gen, trtllm-gen-native | fa2, cudnn | -| **BatchPrefillWithRaggedKVCacheWrapper** | | fa2, cudnn | fa2, cudnn | fa2, cudnn | fa2, fa3, cudnn | fa2, cudnn, cutlass, trtllm-gen-native | fa2, cudnn, cutlass, trtllm-gen-native | fa2, cudnn | -| **BatchMLAPagedAttentionWrapper** | | fa2 | fa2 | fa2 | fa2, fa3 | fa2, cutlass, trtllm-gen-native | fa2, cutlass, trtllm-gen-native | fa2 | +| **BatchDecodeWithPagedKVCacheWrapper** | fa2 | fa2, fa2_tc, cudnn | fa2, fa2_tc, cudnn | fa2, fa2_tc, cudnn | fa2, fa2_tc, cudnn | fa2, fa2_tc, cudnn, trtllm-gen, trtllm-native | fa2, fa2_tc, cudnn, trtllm-gen, trtllm-native | fa2, fa2_tc, cudnn | +| **BatchPrefillWithPagedKVCacheWrapper** | | fa2, cudnn | fa2, cudnn | fa2, cudnn | fa2, fa3, cudnn | fa2, cudnn, trtllm-gen, trtllm-native | fa2, cudnn, trtllm-gen, trtllm-native | fa2, cudnn | +| **BatchPrefillWithRaggedKVCacheWrapper** | | fa2, cudnn | fa2, cudnn | fa2, cudnn | fa2, fa3, cudnn | fa2, cudnn, cutlass, trtllm-native | fa2, cudnn, cutlass, trtllm-native | fa2, cudnn | +| **BatchMLAPagedAttentionWrapper** | | fa2 | fa2 | fa2 | fa2, fa3 | fa2, cutlass, trtllm-native | fa2, cutlass, trtllm-native | fa2 | | **gemm_fp8_nt_groupwise** | | | | | | cutlass | cutlass | | | **group_gemm_fp8_nt_groupwise** | | | | | | cutlass | cutlass | | | **bmm_fp8** | | | | cudnn, cublas | cudnn, cublas | cudnn, cublas, cutlass | cudnn, cublas, cutlass | cudnn, cublas | @@ -238,4 +237,4 @@ Backend Legend: - cutlass: CUTLASS - trtllm: TensorRT-LLM - trtllm-gen: TensorRT-LLM -- trtllm-gen-native: TensorRT-LLM (out-of-wrapper) +- trtllm-native: TensorRT-LLM (out-of-wrapper) diff --git a/benchmarks/bench_batch_attention.py b/benchmarks/bench_batch_attention.py index 2c1071d808..c94a86eacc 100644 --- a/benchmarks/bench_batch_attention.py +++ b/benchmarks/bench_batch_attention.py @@ -436,7 +436,7 @@ def main(args: argparse.Namespace) -> None: records_new = [] records_separate = [] for cfg_id, (decode_case, prefill_case) in enumerate( - zip(decode_lens, prefill_lens), start=1 + zip(decode_lens, prefill_lens, strict=True), start=1 ): prefill_kv_lens = [p[0] for p in prefill_case] prefill_qo_lens = [p[1] for p in prefill_case] diff --git a/benchmarks/bench_blackwell_attention.py b/benchmarks/bench_blackwell_attention.py index 52452e05a8..73b0cd0b3c 100644 --- a/benchmarks/bench_blackwell_attention.py +++ b/benchmarks/bench_blackwell_attention.py @@ -18,7 +18,10 @@ import torch import flashinfer -from flashinfer.testing.utils import bench_gpu_time +from flashinfer.testing.utils import ( + bench_gpu_time, + attention_tflops_per_sec_with_actual_seq_lens, +) def bench_fmha_blackwell( @@ -69,14 +72,17 @@ def bench_fmha_blackwell( ) ms = np.median(measurements) - def flops(ms): - if causal: - return batch_size * qkv_len * 
qkv_len * num_heads * head_dim * 2 / ms / 1e9 - else: - return batch_size * qkv_len * qkv_len * num_heads * head_dim * 4 / ms / 1e9 - + TFLOPS = attention_tflops_per_sec_with_actual_seq_lens( + torch.full((batch_size,), qkv_len), + torch.full((batch_size,), qkv_len), + head_dim, + head_dim, + num_heads, + causal, + ms, + ) print( - f"bench_fmha_blackwell (batch_size={batch_size}, qkv_len={qkv_len}, num_heads={num_heads}, head_dim={head_dim}, causal={causal}), flops: {flops(ms):.3f} TFLOPs/s" + f"bench_fmha_blackwell (batch_size={batch_size}, qkv_len={qkv_len}, num_heads={num_heads}, head_dim={head_dim}, causal={causal}), flops: {TFLOPS:.3f} TFLOPs/s" ) diff --git a/benchmarks/bench_block_sparse_attention.py b/benchmarks/bench_block_sparse_attention.py index e2a51012f5..2da2478a6f 100644 --- a/benchmarks/bench_block_sparse_attention.py +++ b/benchmarks/bench_block_sparse_attention.py @@ -18,7 +18,10 @@ import torch import flashinfer -from flashinfer.testing.utils import bench_gpu_time +from flashinfer.testing.utils import ( + bench_gpu_time, + attention_tflops_per_sec_with_actual_seq_lens, +) def bench_variable_block_sparse_attention( @@ -120,7 +123,15 @@ def bench_variable_block_sparse_attention( ) def flops(ms): - return seq_len * seq_len * num_qo_heads * head_dim * 4 / ms / 1e9 + return attention_tflops_per_sec_with_actual_seq_lens( + torch.tensor([seq_len]), + torch.tensor([seq_len]), + head_dim, + head_dim, + num_qo_heads, + False, + ms, + ) print( f"bench_variable_block_sparse_attention (num_qo_heads={num_qo_heads}, num_kv_heads={num_kv_heads}, head_dim={head_dim}, seq_len={seq_len}, num_blocks_row={num_blocks_row}, num_blocks_col={num_blocks_col}, block_density={block_density}), sparse fa2-template: {flops(sparse_ms_fa2):.3f} TFLOPs/s, sparse fa3-template: {flops(sparse_ms_fa3):.3f} TFLOPs/s, dense fa2-template: {flops(dense_sm80_ms):.3f} TFLOPs/s, dense fa3-template: {flops(dense_sm90_ms):.3f} TFLOPs/s" diff --git a/benchmarks/bench_hopper_attention.py b/benchmarks/bench_hopper_attention.py index 6ad2fdaa1b..c1e56e6225 100644 --- a/benchmarks/bench_hopper_attention.py +++ b/benchmarks/bench_hopper_attention.py @@ -18,7 +18,10 @@ import torch import flashinfer -from flashinfer.testing.utils import bench_gpu_time +from flashinfer.testing.utils import ( + bench_gpu_time, + attention_tflops_per_sec_with_actual_seq_lens, +) def bench_single_prefill(seq_len, num_heads, causal, head_dim): @@ -41,10 +44,15 @@ def bench_single_prefill(seq_len, num_heads, causal, head_dim): ) def flops(ms): - if causal: - return seq_len * seq_len * num_qo_heads * head_dim * 2 / ms / 1e9 - else: - return seq_len * seq_len * num_qo_heads * head_dim * 4 / ms / 1e9 + return attention_tflops_per_sec_with_actual_seq_lens( + torch.tensor([seq_len]), + torch.tensor([seq_len]), + head_dim, + head_dim, + num_qo_heads, + causal, + ms, + ) print( f"bench_single_prefill (seq_len={seq_len}, num_heads={num_heads}, causal={causal}, head_dim={head_dim}), fa2-template: {flops(sm80_ms):.3f} TFLOPs/s, fa3-template: {flops(sm90_ms):.3f} TFLOPs/s" @@ -97,14 +105,15 @@ def bench_batch_ragged_prefill(batch_size, num_heads, seq_len, causal, head_dim) ) def flops(ms): - if causal: - return ( - batch_size * seq_len * seq_len * num_qo_heads * head_dim * 2 / ms / 1e9 - ) - else: - return ( - batch_size * seq_len * seq_len * num_qo_heads * head_dim * 4 / ms / 1e9 - ) + return attention_tflops_per_sec_with_actual_seq_lens( + torch.full((batch_size,), seq_len), + torch.full((batch_size,), seq_len), + head_dim, + head_dim, + num_qo_heads, + 
causal, + ms, + ) print( f"bench_batch_ragged_prefill (batch_size={batch_size}, num_heads={num_heads}, seq_len={seq_len}, causal={causal}, head_dim={head_dim}), fa2-template: {flops(sm80_ms):.3f} TFLOPs/s, fa3-template: {flops(sm90_ms):.3f} TFLOPs/s" @@ -176,14 +185,15 @@ def bench_batch_paged_prefill( ) def flops(ms): - if causal: - return ( - batch_size * seq_len * seq_len * num_qo_heads * head_dim * 2 / ms / 1e9 - ) - else: - return ( - batch_size * seq_len * seq_len * num_qo_heads * head_dim * 4 / ms / 1e9 - ) + return attention_tflops_per_sec_with_actual_seq_lens( + torch.full((batch_size,), seq_len), + torch.full((batch_size,), seq_len), + head_dim, + head_dim, + num_qo_heads, + causal, + ms, + ) print( f"bench_batch_paged_prefill (page_size={page_size} batch_size={batch_size}, num_heads={num_heads}, seq_len={seq_len}, causal={causal}, head_dim={head_dim}), fa2-template: {flops(sm80_ms):.3f} TFLOPs/s, fa3-template: {flops(sm90_ms):.3f} TFLOPs/s" diff --git a/benchmarks/bench_hopper_fp8_attention.py b/benchmarks/bench_hopper_fp8_attention.py index 34d71d7f9e..75b02024d6 100644 --- a/benchmarks/bench_hopper_fp8_attention.py +++ b/benchmarks/bench_hopper_fp8_attention.py @@ -1,43 +1,88 @@ +""" +Copyright (c) 2024 by FlashInfer team. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + import numpy as np import torch import flashinfer -from flashinfer.testing.utils import bench_gpu_time +from flashinfer.testing.utils import ( + bench_gpu_time, + attention_tflops_per_sec_with_actual_seq_lens, +) -def bench_single_prefill(seq_len, num_heads, causal, head_dim): - num_qo_heads = num_kv_heads = num_heads - q = torch.randn(seq_len, num_qo_heads, head_dim, dtype=torch.half, device="cuda") - k = torch.randn(seq_len, num_kv_heads, head_dim, dtype=torch.half, device="cuda") - v = torch.randn(seq_len, num_kv_heads, head_dim, dtype=torch.half, device="cuda") - - sm80_ms, sm90_ms = ( - np.median( - bench_gpu_time( - lambda: flashinfer.single_prefill_with_kv_cache_return_lse( - q, k, v, causal=causal, backend=backend - ), - dry_run_time_ms=100, - repeat_time_ms=1000, - ) - ) - for backend in ["fa2", "fa3"] +def per_head_symmetric_quant(x, quant_dtype): + """Per-head symmetric quantization to FP8.""" + o_min_val, o_max_val = ( + (-448.0, 448.0) if quant_dtype == torch.float8_e4m3fn else (-57344, 57344) ) + x_max_val = x.abs().amax(dim=(0, 2)).to(dtype=torch.float32) + s_out = torch.clamp(x_max_val / o_max_val, min=1e-6) + s_out_broadcast = s_out.view(1, -1, 1) + q_x_out = torch.clamp(x / s_out_broadcast, min=o_min_val, max=o_max_val).to( + dtype=quant_dtype + ) + return q_x_out, s_out + + +def bench_fp8_single_prefill( + seq_len, num_heads, causal, head_dim, dtype=torch.float8_e4m3fn +): + """Benchmark FP8 single prefill attention.""" + num_qo_heads = num_kv_heads = num_heads - q = torch.randn( + # Create FP16 tensors first, then quantize + q_fp16 = torch.randn( seq_len, num_qo_heads, head_dim, dtype=torch.half, device="cuda" - ).to(dtype=torch.float8_e4m3fn) - k = torch.randn( + ) + k_fp16 = torch.randn( seq_len, num_kv_heads, head_dim, dtype=torch.half, device="cuda" - ).to(dtype=torch.float8_e4m3fn) - v = torch.randn( + ) + v_fp16 = torch.randn( seq_len, num_kv_heads, head_dim, dtype=torch.half, device="cuda" - ).to(dtype=torch.float8_e4m3fn) + ) + + # Quantize to FP8 + q_fp8, s_q = per_head_symmetric_quant(q_fp16, dtype) + k_fp8, s_k = per_head_symmetric_quant(k_fp16, dtype) + v_fp8, s_v = per_head_symmetric_quant(v_fp16, dtype) + + # FP16 baseline (fa3) + fp16_ms = np.median( + bench_gpu_time( + lambda: flashinfer.single_prefill_with_kv_cache_return_lse( + q_fp16, k_fp16, v_fp16, causal=causal, backend="fa3" + ), + dry_run_time_ms=100, + repeat_time_ms=1000, + ) + ) - fp8_sm90_ms = np.median( + # FP8 (fa3) + fp8_ms = np.median( bench_gpu_time( lambda: flashinfer.single_prefill_with_kv_cache_return_lse( - q, k, v, causal=causal, backend="fa3", o_dtype=torch.half + q_fp8, + k_fp8, + v_fp8, + causal=causal, + backend="fa3", + scale_q=s_q, + scale_k=s_k, + scale_v=s_v, ), dry_run_time_ms=100, repeat_time_ms=1000, @@ -45,13 +90,233 @@ def bench_single_prefill(seq_len, num_heads, causal, head_dim): ) def flops(ms): - if causal: - return seq_len * seq_len * num_qo_heads * head_dim * 2 / ms / 1e9 - else: - return seq_len * seq_len * num_qo_heads * head_dim * 4 / ms / 1e9 + return attention_tflops_per_sec_with_actual_seq_lens( + torch.tensor([seq_len]), + torch.tensor([seq_len]), + head_dim, + head_dim, + num_qo_heads, + causal, + ms, + ) print( - f"bench_single_prefill (seq_len={seq_len}, num_heads={num_heads}, causal={causal}, head_dim={head_dim}), fa2-template: {flops(sm80_ms):.3f} TFLOPs/s, fa3-template: {flops(sm90_ms):.3f} TFLOPs/s, fa3-fp8: {flops(fp8_sm90_ms):.3f} TFLOPs/s" + f"bench_fp8_single_prefill (seq_len={seq_len}, num_heads={num_heads}, causal={causal}, 
head_dim={head_dim}), " + f"fp16: {flops(fp16_ms):.3f} TFLOPs/s ({fp16_ms:.3f}ms), " + f"fp8: {flops(fp8_ms):.3f} TFLOPs/s ({fp8_ms:.3f}ms), " + f"speedup: {fp16_ms / fp8_ms:.2f}x" + ) + + +def bench_fp8_batch_ragged_prefill( + batch_size, num_heads, seq_len, causal, head_dim, dtype=torch.float8_e4m3fn +): + """Benchmark FP8 batch ragged prefill attention.""" + num_qo_heads = num_kv_heads = num_heads + total_len = batch_size * seq_len + + # Create FP16 tensors first + q_fp16 = torch.randn( + total_len, num_qo_heads, head_dim, dtype=torch.half, device="cuda" + ) + k_fp16 = torch.randn( + total_len, num_kv_heads, head_dim, dtype=torch.half, device="cuda" + ) + v_fp16 = torch.randn( + total_len, num_kv_heads, head_dim, dtype=torch.half, device="cuda" + ) + + # Quantize to FP8 + q_fp8, s_q = per_head_symmetric_quant(q_fp16, dtype) + k_fp8, s_k = per_head_symmetric_quant(k_fp16, dtype) + v_fp8, s_v = per_head_symmetric_quant(v_fp16, dtype) + + qo_indptr = torch.arange( + 0, total_len + 1, seq_len, dtype=torch.int32, device="cuda" + ) + kv_indptr = torch.arange( + 0, total_len + 1, seq_len, dtype=torch.int32, device="cuda" + ) + + # FP16 wrapper + fp16_wrapper = flashinfer.BatchPrefillWithRaggedKVCacheWrapper( + torch.empty(256 * 1024 * 1024, dtype=torch.uint8, device="cuda"), + kv_layout="NHD", + backend="fa3", + ) + fp16_wrapper.plan( + qo_indptr, kv_indptr, num_qo_heads, num_kv_heads, head_dim, causal=causal + ) + + # FP8 wrapper + fp8_wrapper = flashinfer.BatchPrefillWithRaggedKVCacheWrapper( + torch.empty(256 * 1024 * 1024, dtype=torch.uint8, device="cuda"), + kv_layout="NHD", + backend="fa3", + ) + fp8_wrapper.plan( + qo_indptr, + kv_indptr, + num_qo_heads, + num_kv_heads, + head_dim, + q_data_type=dtype, + kv_data_type=dtype, + o_data_type=torch.half, + causal=causal, + ) + + fp16_ms = np.median( + bench_gpu_time( + lambda: fp16_wrapper.run(q_fp16, k_fp16, v_fp16), + dry_run_time_ms=100, + repeat_time_ms=1000, + ) + ) + + fp8_ms = np.median( + bench_gpu_time( + lambda: fp8_wrapper.run(q_fp8, k_fp8, v_fp8, s_q, s_k, s_v), + dry_run_time_ms=100, + repeat_time_ms=1000, + ) + ) + + def flops(ms): + return attention_tflops_per_sec_with_actual_seq_lens( + torch.full((batch_size,), seq_len), + torch.full((batch_size,), seq_len), + head_dim, + head_dim, + num_qo_heads, + causal, + ms, + ) + + print( + f"bench_fp8_batch_ragged_prefill (batch_size={batch_size}, num_heads={num_heads}, seq_len={seq_len}, causal={causal}, head_dim={head_dim}), " + f"fp16: {flops(fp16_ms):.3f} TFLOPs/s ({fp16_ms:.3f}ms), " + f"fp8: {flops(fp8_ms):.3f} TFLOPs/s ({fp8_ms:.3f}ms), " + f"speedup: {fp16_ms / fp8_ms:.2f}x" + ) + + +def bench_fp8_batch_paged_prefill( + page_size, + batch_size, + num_heads, + seq_len, + causal, + head_dim, + dtype=torch.float8_e4m3fn, +): + """Benchmark FP8 batch paged prefill attention.""" + num_qo_heads = num_kv_heads = num_heads + total_qo_len = batch_size * seq_len + num_pages = batch_size * seq_len // page_size + + # Create FP16 tensors first + q_fp16 = torch.randn( + total_qo_len, num_qo_heads, head_dim, dtype=torch.half, device="cuda" + ) + # Paged KV cache: (num_pages, page_size, num_heads, head_dim) + k_fp16 = torch.randn( + num_pages, page_size, num_kv_heads, head_dim, dtype=torch.half, device="cuda" + ) + v_fp16 = torch.randn( + num_pages, page_size, num_kv_heads, head_dim, dtype=torch.half, device="cuda" + ) + + # Quantize to FP8 + q_fp8, s_q = per_head_symmetric_quant(q_fp16, dtype) + # For paged KV, reshape to (total_tokens, num_heads, head_dim) for quantization + k_flat = 
k_fp16.view(-1, num_kv_heads, head_dim) + v_flat = v_fp16.view(-1, num_kv_heads, head_dim) + k_fp8_flat, s_k = per_head_symmetric_quant(k_flat, dtype) + v_fp8_flat, s_v = per_head_symmetric_quant(v_flat, dtype) + k_fp8 = k_fp8_flat.view(num_pages, page_size, num_kv_heads, head_dim) + v_fp8 = v_fp8_flat.view(num_pages, page_size, num_kv_heads, head_dim) + + qo_indptr = torch.arange( + 0, total_qo_len + 1, seq_len, dtype=torch.int32, device="cuda" + ) + kv_indptr = torch.arange( + 0, num_pages + 1, seq_len // page_size, dtype=torch.int32, device="cuda" + ) + kv_indices = torch.arange(0, num_pages, dtype=torch.int32, device="cuda") + last_page_len = torch.ones(batch_size, dtype=torch.int32, device="cuda") * page_size + + # FP16 wrapper + fp16_wrapper = flashinfer.BatchPrefillWithPagedKVCacheWrapper( + torch.empty(256 * 1024 * 1024, dtype=torch.uint8, device="cuda"), + kv_layout="NHD", + backend="fa3", + ) + fp16_wrapper.plan( + qo_indptr, + kv_indptr, + kv_indices, + last_page_len, + num_qo_heads, + num_kv_heads, + head_dim, + page_size, + causal=causal, + ) + + # FP8 wrapper + fp8_wrapper = flashinfer.BatchPrefillWithPagedKVCacheWrapper( + torch.empty(256 * 1024 * 1024, dtype=torch.uint8, device="cuda"), + kv_layout="NHD", + backend="fa3", + ) + fp8_wrapper.plan( + qo_indptr, + kv_indptr, + kv_indices, + last_page_len, + num_qo_heads, + num_kv_heads, + head_dim, + page_size, + q_data_type=dtype, + kv_data_type=dtype, + o_data_type=torch.half, + causal=causal, + ) + + fp16_ms = np.median( + bench_gpu_time( + lambda: fp16_wrapper.run(q_fp16, (k_fp16, v_fp16)), + dry_run_time_ms=100, + repeat_time_ms=1000, + ) + ) + + fp8_ms = np.median( + bench_gpu_time( + lambda: fp8_wrapper.run(q_fp8, (k_fp8, v_fp8), s_q, s_k, s_v), + dry_run_time_ms=100, + repeat_time_ms=1000, + ) + ) + + def flops(ms): + return attention_tflops_per_sec_with_actual_seq_lens( + torch.full((batch_size,), seq_len), + torch.full((batch_size,), seq_len), + head_dim, + head_dim, + num_qo_heads, + causal, + ms, + ) + + print( + f"bench_fp8_batch_paged_prefill (page_size={page_size}, batch_size={batch_size}, num_heads={num_heads}, seq_len={seq_len}, causal={causal}, head_dim={head_dim}), " + f"fp16: {flops(fp16_ms):.3f} TFLOPs/s ({fp16_ms:.3f}ms), " + f"fp8: {flops(fp8_ms):.3f} TFLOPs/s ({fp8_ms:.3f}ms), " + f"speedup: {fp16_ms / fp8_ms:.2f}x" ) @@ -62,8 +327,30 @@ def flops(ms): print("Current benchmark targets capability (9, 0). 
Returning...") exit() - for seq_len in [4096, 8192, 16384]: - for num_heads in [24, 32]: - for causal in [True, False]: - for head_dim in [64, 128, 256]: - bench_single_prefill(seq_len, num_heads, causal, head_dim) + # Skip single prefill for now due to compilation issues + # print("=" * 80) + # print("FP8 Single Prefill Benchmarks") + # print("=" * 80) + # for head_dim in [128, 256]: + # for seq_len in [1024, 4096, 8192]: + # bench_fp8_single_prefill(seq_len, 32, True, head_dim) + + print() + print("=" * 80) + print("FP8 Batch Ragged Prefill Benchmarks") + print("=" * 80) + for head_dim in [128, 256]: + bench_fp8_batch_ragged_prefill(128, 32, 1024, True, head_dim) + bench_fp8_batch_ragged_prefill(64, 32, 2048, True, head_dim) + bench_fp8_batch_ragged_prefill(32, 32, 4096, True, head_dim) + bench_fp8_batch_ragged_prefill(16, 32, 8192, True, head_dim) + + print() + print("=" * 80) + print("FP8 Batch Paged Prefill Benchmarks") + print("=" * 80) + for head_dim in [128, 256]: + bench_fp8_batch_paged_prefill(16, 128, 32, 1024, True, head_dim) + bench_fp8_batch_paged_prefill(16, 64, 32, 2048, True, head_dim) + bench_fp8_batch_paged_prefill(16, 32, 32, 4096, True, head_dim) + bench_fp8_batch_paged_prefill(16, 16, 32, 8192, True, head_dim) diff --git a/benchmarks/bench_logging_overhead.py b/benchmarks/bench_logging_overhead.py new file mode 100644 index 0000000000..e67edcfa45 --- /dev/null +++ b/benchmarks/bench_logging_overhead.py @@ -0,0 +1,333 @@ +#!/usr/bin/env python3 +""" +Benchmark script to measure the overhead of API logging at different levels. + +This script creates decorated and undecorated versions of a test function +(torch.matmul) and compares their performance to accurately measure logging overhead. + +Usage: + # Set the logging level before running + export FLASHINFER_LOGLEVEL=3 + python bench_logging_overhead.py + + # Or run with different levels + FLASHINFER_LOGLEVEL=0 python bench_logging_overhead.py + FLASHINFER_LOGLEVEL=1 python bench_logging_overhead.py + FLASHINFER_LOGLEVEL=3 python bench_logging_overhead.py + FLASHINFER_LOGLEVEL=5 python bench_logging_overhead.py + + # Or use the helper script to run all levels + bash benchmark_all_levels.sh +""" + +import os +import sys +import time +import torch +import numpy as np +from typing import List, Tuple + +# Get logging level BEFORE importing flashinfer +LOGGING_LEVEL = int(os.environ.get("FLASHINFER_LOGLEVEL", "0")) +LOG_DEST = os.environ.get("FLASHINFER_LOGDEST", "/tmp/flashinfer_benchmark_log.txt") + +# Import the decorator +from flashinfer.api_logging import flashinfer_api + + +# Create two versions of a test function: +# 1. Undecorated (baseline) +# 2. 
Decorated (with logging) +def test_matmul_undecorated(A, B): + return torch.matmul(A, B) + + +@flashinfer_api +def test_matmul_decorated(A, B): + return torch.matmul(A, B) + + +class BenchmarkResults: + """Store and display benchmark results.""" + + def __init__(self): + self.undecorated_times = [] + self.decorated_times = [] + + def set_undecorated(self, times: List[float]): + """Set benchmark results for undecorated function.""" + self.undecorated_times = times + + def set_decorated(self, times: List[float]): + """Set benchmark results for decorated function.""" + self.decorated_times = times + + def print_summary(self, logging_level: int): + """Print a summary of benchmark results.""" + print("\n" + "=" * 80) + print("BENCHMARK RESULTS") + print("=" * 80) + + undecorated_mean = np.mean(self.undecorated_times) + undecorated_std = np.std(self.undecorated_times) + + decorated_mean = np.mean(self.decorated_times) + decorated_std = np.std(self.decorated_times) + + overhead_abs = (decorated_mean - undecorated_mean) * 1000 # ms + overhead_pct = ( + ((decorated_mean - undecorated_mean) / undecorated_mean * 100) + if undecorated_mean > 0 + else 0 + ) + + print( + f"\n{'Version':<20} {'Mean (ms)':<12} {'Std (ms)':<12} {'Median (ms)':<12}" + ) + print("-" * 80) + print( + f"{'Undecorated':<20} {undecorated_mean * 1000:<12.4f} {undecorated_std * 1000:<12.4f} {np.median(self.undecorated_times) * 1000:<12.4f}" + ) + print( + f"{'Decorated':<20} {decorated_mean * 1000:<12.4f} {decorated_std * 1000:<12.4f} {np.median(self.decorated_times) * 1000:<12.4f}" + ) + + print("\n" + "=" * 80) + print("OVERHEAD ANALYSIS") + print("=" * 80) + print(f"\nLogging Level: {logging_level}") + print(f"Absolute overhead: {overhead_abs:.4f} ms") + print(f"Relative overhead: {overhead_pct:.2f}%") + + print("\n" + "=" * 80) + print("DETAILED STATISTICS") + print("=" * 80) + + print("\nUndecorated (baseline):") + print(f" Mean: {undecorated_mean * 1000:.4f} ms") + print(f" Median: {np.median(self.undecorated_times) * 1000:.4f} ms") + print(f" Std: {undecorated_std * 1000:.4f} ms") + print(f" Min: {np.min(self.undecorated_times) * 1000:.4f} ms") + print(f" Max: {np.max(self.undecorated_times) * 1000:.4f} ms") + + print("\nDecorated (with logging):") + print(f" Mean: {decorated_mean * 1000:.4f} ms") + print(f" Median: {np.median(self.decorated_times) * 1000:.4f} ms") + print(f" Std: {decorated_std * 1000:.4f} ms") + print(f" Min: {np.min(self.decorated_times) * 1000:.4f} ms") + print(f" Max: {np.max(self.decorated_times) * 1000:.4f} ms") + + +def setup_test_inputs( + batch_size: int = 32, + m: int = 512, + n: int = 512, + k: int = 512, + device: str = "cuda:0", +) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Set up test inputs for matmul. + + Parameters + ---------- + batch_size : int + Batch size for the matrix multiplication + m, n, k : int + Matrix dimensions + device : str + Device to use + + Returns + ------- + A, B : torch.Tensor + Input tensors for matrix multiplication + """ + # Create random tensors + A = torch.randn(batch_size, m, k, dtype=torch.float16, device=device) + B = torch.randn(batch_size, k, n, dtype=torch.float16, device=device) + + return A, B + + +def warmup(func, A, B, num_warmup: int = 10): + """Warmup the GPU and JIT compilation.""" + for _ in range(num_warmup): + _ = func(A, B) + torch.cuda.synchronize() + + +def benchmark_function( + func, func_name: str, A, B, num_iterations: int = 100 +) -> List[float]: + """ + Benchmark a specific function. 
+ + Parameters + ---------- + func : callable + Function to benchmark + func_name : str + Name of the function (for display) + A, B : torch.Tensor + Input tensors for matrix multiplication + num_iterations : int + Number of iterations to run + + Returns + ------- + List[float] + List of execution times in seconds + """ + print(f"\nBenchmarking: {func_name}") + print(f" Running {num_iterations} iterations...") + + times = [] + + for _ in range(num_iterations): + # Synchronize before timing + torch.cuda.synchronize() + + # Time the execution + start = time.perf_counter() + _ = func(A, B) + torch.cuda.synchronize() + end = time.perf_counter() + + elapsed = end - start + times.append(elapsed) + + print(f" Complete. Mean time: {np.mean(times) * 1000:.4f} ms") + + return times + + +def main(): + """Main benchmark function.""" + print("=" * 80) + print("FlashInfer API Logging Overhead Benchmark") + print("=" * 80) + + # Display logging configuration + print("\nLogging Configuration:") + print(f" FLASHINFER_LOGLEVEL = {LOGGING_LEVEL}") + print(f" FLASHINFER_LOGDEST = {LOG_DEST}") + + # Get level name + level_names = { + 0: "No logging (zero-overhead)", + 1: "Function name only", + 3: "Name + inputs/outputs + metadata", + 5: "Name + inputs/outputs + metadata + statistics", + } + print(f" Level description: {level_names.get(LOGGING_LEVEL, 'Unknown')}") + + # Check if CUDA is available + if not torch.cuda.is_available(): + print("\nError: CUDA is not available. This benchmark requires a CUDA device.") + exit(1) + + device = "cuda:0" + print(f"\nDevice: {device}") + print(f"Device Name: {torch.cuda.get_device_name(device)}") + + # Setup test inputs + print("\nSetting up test inputs...") + batch_size = 32 + m, n, k = 128, 128, 128 + print(f" Batch size: {batch_size}") + print(f" Matrix dimensions: [{batch_size}, {m}, {k}] @ [{batch_size}, {k}, {n}]") + + A, B = setup_test_inputs(batch_size, m, n, k, device) + + # Benchmark parameters + num_iterations = 100 + print("\nBenchmark parameters:") + print(f" Iterations: {num_iterations}") + print(" Warmup iterations: 10") + + # Clear log file before starting + if os.path.exists(LOG_DEST): + os.remove(LOG_DEST) + + print("\n" + "=" * 80) + print("WARMUP PHASE") + print("=" * 80) + + # Warmup undecorated version + print("\nWarming up undecorated version...") + warmup(test_matmul_undecorated, A, B, num_warmup=10) + print(" Complete.") + + # Warmup decorated version + print("\nWarming up decorated version...") + warmup(test_matmul_decorated, A, B, num_warmup=10) + print(" Complete.") + + print("\n" + "=" * 80) + print("BENCHMARK PHASE") + print("=" * 80) + + # Store results + results = BenchmarkResults() + + # Benchmark undecorated version + undecorated_times = benchmark_function( + test_matmul_undecorated, "Undecorated (baseline)", A, B, num_iterations + ) + results.set_undecorated(undecorated_times) + + # Benchmark decorated version + decorated_times = benchmark_function( + test_matmul_decorated, + f"Decorated (logging level {LOGGING_LEVEL})", + A, + B, + num_iterations, + ) + results.set_decorated(decorated_times) + + # Print summary + results.print_summary(LOGGING_LEVEL) + + # Check log file size + if LOGGING_LEVEL > 0 and os.path.exists(LOG_DEST): + log_size = os.path.getsize(LOG_DEST) + print("\n" + "=" * 80) + print("LOG FILE INFO") + print("=" * 80) + print(f"Log file: {LOG_DEST}") + print(f"Log size: {log_size / 1024:.2f} KB ({log_size} bytes)") + print(f"Iterations logged: {num_iterations}") + print(f"Bytes per iteration: {log_size / 
num_iterations:.2f}") + + # Cleanup option + cleanup_log = os.environ.get("CLEANUP_LOG", "true").lower() == "true" + if cleanup_log: + os.remove(LOG_DEST) + print("\n Log file removed (set CLEANUP_LOG=false to keep it)") + else: + print(f"\n Log file preserved at {LOG_DEST}") + + print("\n" + "=" * 80) + print("RECOMMENDATIONS") + print("=" * 80) + print("\nTo benchmark other levels, run:") + for level in [0, 1, 3, 5]: + if level != LOGGING_LEVEL: + print(f" FLASHINFER_LOGLEVEL={level} python {sys.argv[0]}") + + print("\n" + "=" * 80) + print("Benchmark complete!") + print("=" * 80) + + +if __name__ == "__main__": + try: + main() + except KeyboardInterrupt: + print("\n\nBenchmark interrupted by user.") + except Exception as e: + print(f"\n\nError during benchmark: {e}") + import traceback + + traceback.print_exc() diff --git a/benchmarks/bench_mixed_attention.py b/benchmarks/bench_mixed_attention.py index 85753a71f9..7414a58af0 100644 --- a/benchmarks/bench_mixed_attention.py +++ b/benchmarks/bench_mixed_attention.py @@ -23,7 +23,10 @@ def run_bench( q_lens = torch.tensor(d_qo_lens + p_qo_lens, dtype=torch.int32) seq_lens_blocks = torch.ceil(seq_lens / page_block_size).int() - d_seq_lens_blocks = ( + p_seq_lens_blocks = torch.ceil( + torch.tensor(p_kv_lens, dtype=torch.int32) / page_block_size + ).int() + d_seq_lens_blocks = torch.ceil( torch.tensor(d_kv_lens, dtype=torch.int32) / page_block_size ).int() @@ -31,6 +34,14 @@ def run_bench( kv_indptr = torch.cat( [torch.tensor([0]), torch.cumsum(seq_lens_blocks, 0)], dim=0 ).int() + + p_q_indptr = torch.cat( + [torch.tensor([0]), torch.cumsum(torch.tensor(p_qo_lens), 0)], dim=0 + ).int() + p_kv_indptr = torch.cat( + [torch.tensor([0]), torch.cumsum(p_seq_lens_blocks, 0)], dim=0 + ).int() + d_q_indptr = torch.cat( [torch.tensor([0]), torch.cumsum(torch.tensor(d_qo_lens), 0)], dim=0 ).int() @@ -46,7 +57,7 @@ def run_bench( device, dtype=torch.bfloat16 ) - workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.uint8, device=device) + workspace_buffer = torch.empty(156 * 1024 * 1024, dtype=torch.uint8, device=device) kv_layout = "NHD" wrapper_old = flashinfer.BatchPrefillWithPagedKVCacheWrapper( @@ -72,7 +83,85 @@ def run_bench( measurements = bench_gpu_time(lambda: wrapper_old.run(q, kv_data)) ms_old = np.median(measurements) + wrapper_persistent = flashinfer.BatchAttention(kv_layout="NHD") + wrapper_persistent.plan( + q_indptr.to(device), + kv_indptr.to(device), + torch.arange(num_blocks, dtype=torch.int32, device=device), + seq_lens.to(device), + num_qo_heads, + num_kv_heads, + head_dim, + head_dim, + page_block_size, + causal=causal, + q_data_type=torch.bfloat16, + kv_data_type=torch.bfloat16, + ) + o_persistent, _ = wrapper_persistent.run(q, kv_data) + measurements_persistent = bench_gpu_time(lambda: wrapper_persistent.run(q, kv_data)) + ms_persistent = np.mean(measurements_persistent) + + # Batched POD Attention + q_d = q[: d_q_indptr[-1]] + kv_d = kv_data[: d_kv_indptr[-1]].unbind(1) + q_p = q[d_q_indptr[-1] :] + kv_p = kv_data[d_kv_indptr[-1] :].unbind(1) + kv_indices_d = torch.arange(0, d_kv_indptr[-1], device=device, dtype=torch.int32) + kv_indices_p = torch.arange(0, p_kv_indptr[-1], device=device, dtype=torch.int32) + + last_page_len_d = (d_seq_lens_blocks - 1) % page_block_size + 1 + last_page_len_p = (p_seq_lens_blocks - 1) % page_block_size + 1 + wrapper_pod = flashinfer.BatchPODWithPagedKVCacheWrapper( + workspace_buffer, + kv_layout=kv_layout, + ) + + wrapper_pod.plan( + # Prefill params + p_q_indptr.to(device), + 
p_kv_indptr.to(device), + kv_indices_p.to(device), + last_page_len_p, + # Decode params + d_q_indptr.to(device), + d_kv_indptr.to(device), + kv_indices_d.to(device), + last_page_len_d, + # Common params + num_qo_heads=num_qo_heads, + num_kv_heads=num_kv_heads, + head_dim=head_dim, + page_size=page_block_size, + q_data_type=torch.bfloat16, + kv_data_type=torch.bfloat16, + ) + o_p_batch, o_d_batch = wrapper_pod.run( + q_p, + kv_p, + q_d, + kv_d, + causal_p=causal, + ) + o_batch_pod = torch.cat([o_d_batch, o_p_batch], dim=0) + + # Verify output matches + torch.testing.assert_close( + o_batch_pod, o, rtol=4e-3, atol=4e-3, msg="Batch POD-Attention decode mismatch!" + ) + measurements = bench_gpu_time( + lambda: wrapper_pod.run( + q_p, + kv_p, + q_d, + kv_d, + causal_p=causal, + ) + ) + ms_batch_pod = np.median(measurements) + if len(p_kv_lens) == 1: + # Single POD attention q_d = q[: d_q_indptr[-1]] kv_d = kv_data[: d_kv_indptr[-1]].unbind(1) q_p = q[d_q_indptr[-1] :] @@ -109,7 +198,7 @@ def run_bench( o_pod = torch.cat([o_d, o_p], dim=0) # Verify output matches torch.testing.assert_close( - o, o_pod, rtol=1e-3, atol=1e-3, msg="POD-Attention output mismatch!" + o, o_pod, rtol=4e-3, atol=4e-3, msg="POD-Attention output mismatch!" ) measurements = bench_gpu_time( lambda: wrapper_pod.run( @@ -123,9 +212,51 @@ def run_bench( ) ) ms_pod = np.median(measurements) + + # Sequential two kernels: single prefill + batch decode (tensor cores) + # Prefill using single_prefill_with_kv_cache + def _run_single_prefill(): + return flashinfer.prefill.single_prefill_with_kv_cache( + q_p, + k_p, + v_p, + causal=causal, + pos_encoding_mode="NONE", + backend="fa2", + ) + + measurements_prefill = bench_gpu_time(lambda: _run_single_prefill()) + ms_prefill = np.median(measurements_prefill) + + # Batch decode using tensor cores + wrapper_decode = flashinfer.BatchDecodeWithPagedKVCacheWrapper( + workspace_buffer, kv_layout=kv_layout, use_tensor_cores=True + ) + wrapper_decode.plan( + d_kv_indptr.to(device), + kv_indices_d.to(device), + last_page_len_d, + num_qo_heads, + num_kv_heads, + head_dim, + page_block_size, + data_type=torch.bfloat16, + q_data_type=torch.bfloat16, + ) + measurements_decode = bench_gpu_time(lambda: wrapper_decode.run(q_d, kv_d)) + ms_decode = np.median(measurements_decode) + ms_seq_two_kernels = ms_prefill + ms_decode + print(f"Elapsed time (Batched Prefill): {ms_old:.2f} ms") + print(f"Elapsed time (Batched POD Attention): {ms_batch_pod:.2f} ms") if len(p_kv_lens) == 1: print(f"Elapsed time (POD Attention): {ms_pod:.2f} ms") + print(f"Elapsed time (Sequential two kernels): {ms_seq_two_kernels:.2f} ms") + print(f"Elapsed time (Persistent BatchAttention): {ms_persistent:.2f} ms") + print( + f"Batch POD speedup over Persistent BatchAttention: {ms_persistent / ms_batch_pod:.2f}x" + ) + total_bytes = ( q.numel() * q.element_size() + kv_data.numel() * kv_data.element_size() ) @@ -134,9 +265,21 @@ def run_bench( bandwidth_old_gb_s = total_bytes / (ms_old * 1e-3) / (1024**3) print(f"Memory bandwidth (Batched Prefill): {bandwidth_old_gb_s:.2f} GB/s") + bandwidth_batch_pod_gb_s = total_bytes / (ms_batch_pod * 1e-3) / (1024**3) + print( + f"Memory bandwidth (Batched POD Attention): {bandwidth_batch_pod_gb_s:.2f} GB/s" + ) if len(p_kv_lens) == 1: bandwidth_pod_gb_s = total_bytes / (ms_pod * 1e-3) / (1024**3) print(f"Memory bandwidth (POD Attention): {bandwidth_pod_gb_s:.2f} GB/s") + bandwidth_seq_gb_s = total_bytes / (ms_seq_two_kernels * 1e-3) / (1024**3) + print( + f"Memory bandwidth (Sequential two 
kernels): {bandwidth_seq_gb_s:.2f} GB/s" + ) + bandwidth_persistent_gb_s = total_bytes / (ms_persistent * 1e-3) / (1024**3) + print( + f"Memory bandwidth (Persistent BatchAttention): {bandwidth_persistent_gb_s:.2f} GB/s" + ) if __name__ == "__main__": @@ -144,74 +287,26 @@ def run_bench( torch.random.manual_seed(42) # Irregular sequence lengths for prefill and decode - d_q_len_configs = [[1] * 122, [1] * 128, [1] * 242, [1] * 256] - d_kv_len_configs = [[600] * 122, [10000] * 128, [400] * 242, [8192] * 256] - p_q_configs = [[17] * 1, [10000], [17] * 1, []] - p_kv_configs = [[10000] * 1, [10000], [8192] * 1, []] - - # construct random length testcases - for _ in range(1): - bsz = 256 - stride = 16 - sparsity = 0.05 - - full_kv_len = np.random.randint(1000, 8192, size=bsz) - p_q_lens = [] - p_kv_lens = [] - d_q_lens = [] - d_kv_lens = [] - for i in range(bsz): - if i % stride == 0: - kv_len = full_kv_len[i] - qo_len = stride + 1 - p_q_lens.append(qo_len) - p_kv_lens.append(kv_len) - else: - kv_len = int(full_kv_len[i] * sparsity) - qo_len = 1 - d_q_lens.append(qo_len) - d_kv_lens.append(kv_len) - - p_q_configs.append(p_q_lens) - p_kv_configs.append(p_kv_lens) - d_q_len_configs.append(d_q_lens) - d_kv_len_configs.append(d_kv_lens) - - for _ in range(1): - bsz = 128 - stride = 16 - sparsity = 0.05 - - full_kv_len = np.random.randint(2000, 16000, size=bsz) - p_q_lens = [] - p_kv_lens = [] - d_q_lens = [] - d_kv_lens = [] - - for i in range(bsz): - if i % stride == 0: - kv_len = full_kv_len[i] - qo_len = stride + 1 - p_q_lens.append(qo_len) - p_kv_lens.append(kv_len) - else: - kv_len = int(full_kv_len[i] * sparsity) - qo_len = 1 - d_q_lens.append(qo_len) - d_kv_lens.append(kv_len) - - p_q_configs.append(p_q_lens) - p_kv_configs.append(p_kv_lens) - d_q_len_configs.append(d_q_lens) - d_kv_len_configs.append(d_kv_lens) + d_q_len_configs = [[1] * 128] * 7 + d_kv_len_configs = [ + [2048] * 128, + [2048] * 128, + [2048] * 128, + [2048] * 128, + [4096] * 128, + [8192] * 128, + [8192] * 128, + ] + p_q_configs = [[512], [1536], [2048] * 2, [2048], [4096], [4096], [6000]] + p_kv_configs = [[512], [1536], [2048] * 2, [2048], [4096], [4096], [7000]] page_block_size = 1 - num_kv_heads = 4 - num_qo_heads = 28 + num_kv_heads = 8 + num_qo_heads = 32 head_dim = 128 for idx, (p_q_lens, p_kv_lens, d_q_len, d_kv_len) in enumerate( - zip(p_q_configs, p_kv_configs, d_q_len_configs, d_kv_len_configs) + zip(p_q_configs, p_kv_configs, d_q_len_configs, d_kv_len_configs, strict=True) ): print(f"===== Benchmark {idx + 1}: (kv_len, qo_len) set =====") run_bench( diff --git a/benchmarks/bench_mm_fp8.py b/benchmarks/bench_mm_fp8.py index a4df76ebd9..7661d5a57e 100644 --- a/benchmarks/bench_mm_fp8.py +++ b/benchmarks/bench_mm_fp8.py @@ -67,11 +67,12 @@ def bench_mm_fp8(m, n, k, in_dtype, out_dtype): input_fp8, prepared_weights, global_scale, - res, + out=res, ), - dry_run_time_ms=500, - repeat_time_ms=2500, + dry_run_time_ms=25, + repeat_time_ms=100, # 100ms should be enough for low latency kernels that run within 100 usec use_cuda_graph=True, + enable_cupti=True, ) ms = np.median(measurements) tflops_per_second = 2 * m * n * k * 1e-9 / ms diff --git a/benchmarks/bench_rope_quantize_fp8_append_cache.py b/benchmarks/bench_rope_quantize_fp8_append_cache.py new file mode 100644 index 0000000000..3119b9fef8 --- /dev/null +++ b/benchmarks/bench_rope_quantize_fp8_append_cache.py @@ -0,0 +1,342 @@ +""" +Copyright (c) 2024 by FlashInfer team. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import os +import sys +import argparse +import flashinfer +import numpy as np +import torch +from flashinfer.testing.utils import bench_gpu_time_with_cudagraph +from flashinfer.utils import get_gpu_memory_bandwidth + +# Add the project root to Python path to import test helpers +sys.path.append(os.path.join(os.path.dirname(__file__), "..")) +from tests.test_helpers.rope_reference import RotaryEmbedding + + +def benchmark_config( + config_name, + num_tokens, + batch_size=4, + page_size=16, + enable_pdl=False, + single_run=False, +): + """Benchmark a specific attention configuration with paged KV cache append.""" + input_dtype = torch.bfloat16 + device = "cuda" + quant_dtype = torch.float8_e4m3fn + + # Configuration-specific parameters + if config_name == "mla": + # MLA: DeepSeek-style multi-latent attention + num_qo_heads, num_kv_heads = 128, 1 + rope_dim, no_rope_dim = 64, 512 + elif config_name == "gqa": + # GQA: Grouped-query attention (e.g., Llama-style) + num_qo_heads, num_kv_heads = 32, 8 + rope_dim, no_rope_dim = 64, 64 + elif config_name == "mha": + # MHA: Standard multi-head attention + num_qo_heads, num_kv_heads = 32, 32 + rope_dim, no_rope_dim = 64, 64 + else: + raise ValueError(f"Unknown config: {config_name}") + + head_dim = rope_dim + no_rope_dim + + # Create input tensors + if config_name == "mla": + # MLA: 2D K tensors (shared) + q_rope = torch.randn( + num_tokens, num_qo_heads, rope_dim, dtype=input_dtype, device=device + ) + q_nope = torch.randn( + num_tokens, num_qo_heads, no_rope_dim, dtype=input_dtype, device=device + ) + k_rope = torch.randn(num_tokens, rope_dim, dtype=input_dtype, device=device) + k_nope = torch.randn(num_tokens, no_rope_dim, dtype=input_dtype, device=device) + v = None + else: + # GQA/MHA: 3D K/V tensors + q_rope = torch.randn( + num_tokens, num_qo_heads, rope_dim, dtype=input_dtype, device=device + ) + q_nope = torch.randn( + num_tokens, num_qo_heads, no_rope_dim, dtype=input_dtype, device=device + ) + k_rope = torch.randn( + num_tokens, num_kv_heads, rope_dim, dtype=input_dtype, device=device + ) + k_nope = torch.randn( + num_tokens, num_kv_heads, no_rope_dim, dtype=input_dtype, device=device + ) + v = torch.randn( + num_tokens, num_kv_heads, head_dim, dtype=input_dtype, device=device + ) + + # Create RoPE reference for cos/sin cache (ensure it covers this run) + max_seq_len = int(num_tokens) + rope_ref = RotaryEmbedding( + head_size=head_dim, + rotary_dim=rope_dim, + max_position_embeddings=max_seq_len, + base=10000, + is_neox_style=False, + dtype=input_dtype, + device=device, + ) + pos_ids = torch.arange(num_tokens, device=device, dtype=torch.int32) + + # Build paged metadata (single request with all tokens) + kv_append_length = torch.tensor( + [num_tokens] + [0] * (batch_size - 1), dtype=torch.int32, device=device + ) + kv_append_indptr = torch.cat( + [ + torch.zeros(1, dtype=torch.int32, device=device), + torch.cumsum(kv_append_length, dim=0), + ] + ) + num_pages_per_req = torch.tensor( + [(num_tokens + page_size 
- 1) // page_size] + [0] * (batch_size - 1), + dtype=torch.int32, + device=device, + ) + kv_page_indptr = torch.cat( + [ + torch.zeros(1, dtype=torch.int32, device=device), + torch.cumsum(num_pages_per_req, dim=0), + ] + ) + kv_page_indices = torch.arange( + kv_page_indptr[-1].item(), dtype=torch.int32, device=device + ) + kv_last_page_len = torch.tensor( + [num_tokens % page_size if num_tokens % page_size != 0 else page_size] + + [0] * (batch_size - 1), + dtype=torch.int32, + device=device, + ) + + # Get batch_indices and positions + seq_lens = flashinfer.get_seq_lens(kv_page_indptr, kv_last_page_len, page_size) + batch_indices, positions = flashinfer.get_batch_indices_positions( + kv_append_indptr, seq_lens, num_tokens + ) + + # Allocate caches + max_pages = kv_page_indptr[-1].item() + + if config_name == "mla": + ckv_cache = torch.zeros( + max_pages, page_size, no_rope_dim, dtype=quant_dtype, device=device + ) + kpe_cache = torch.zeros( + max_pages, page_size, rope_dim, dtype=quant_dtype, device=device + ) + paged_kv_cache = (ckv_cache, kpe_cache) + else: + # GQA/MHA: use NHD layout + k_cache = torch.zeros( + max_pages, + page_size, + num_kv_heads, + head_dim, + dtype=quant_dtype, + device=device, + ) + v_cache = torch.zeros( + max_pages, + page_size, + num_kv_heads, + head_dim, + dtype=quant_dtype, + device=device, + ) + paged_kv_cache = (k_cache, v_cache) + + run_idx = 0 + + def execute(): + if single_run: + import torch.cuda.nvtx as nvtx + + nvtx.range_push("rope_append") + nonlocal run_idx + run_idx += 1 + + flashinfer.rope.rope_quantize_fp8_append_paged_kv_cache( + q_rope=q_rope, + k_rope=k_rope, + q_nope=q_nope, + k_nope=k_nope, + v=v, + cos_sin_cache=rope_ref.cos_sin_cache, + pos_ids=pos_ids, + paged_kv_cache=paged_kv_cache, + kv_indices=kv_page_indices, + kv_indptr=kv_page_indptr, + batch_indices=batch_indices, + positions=positions, + page_size=page_size, + kv_layout="NHD" if config_name != "mla" else "NHD", + quantize_dtype=quant_dtype, + quant_scale_q=1.0, + quant_scale_kv=1.0, + is_neox=False, + enable_pdl=enable_pdl, + ) + if single_run: + # Ensure kernels complete inside the NVTX range for ncu filtering + torch.cuda.synchronize() + nvtx.range_pop() + + if single_run: + execute() + return None, None, None, None, None + measurements = bench_gpu_time_with_cudagraph(execute) + + # Calculate I/O bytes + # Inputs: q_rope, k_rope, q_nope, k_nope, v (if not MLA), cos_sin_cache, pos_ids + io_bytes = ( + q_rope.numel() * q_rope.element_size() + + k_rope.numel() * k_rope.element_size() + + q_nope.numel() * q_nope.element_size() + + k_nope.numel() * k_nope.element_size() + + rope_ref.cos_sin_cache.numel() * rope_ref.cos_sin_cache.element_size() + + pos_ids.numel() * pos_ids.element_size() + ) + + if v is not None: + io_bytes += v.numel() * v.element_size() + + # Outputs: q_rope_out, q_nope_out (FP8), cache writes (FP8) + io_bytes += ( + q_rope.numel() * torch.finfo(quant_dtype).bits // 8 + + q_nope.numel() * torch.finfo(quant_dtype).bits // 8 + ) + + if config_name == "mla": + # MLA writes to ckv_cache and kpe_cache + io_bytes += ( + num_tokens * no_rope_dim * torch.finfo(quant_dtype).bits // 8 + + num_tokens * rope_dim * torch.finfo(quant_dtype).bits // 8 + ) + else: + # GQA/MHA writes to k_cache and v_cache + io_bytes += ( + num_tokens * num_kv_heads * head_dim * torch.finfo(quant_dtype).bits // 8 + + num_tokens * num_kv_heads * head_dim * torch.finfo(quant_dtype).bits // 8 + ) + + # Calculate statistics + ms = np.median(measurements) + min_ms = np.percentile(measurements, 20) + 
max_ms = np.percentile(measurements, 80) + + # Calculate bandwidth in GB/s + bandwidth_gb_s = io_bytes / ms / 1e6 + + # Calculate TFLOPs (FP operations) + # RoPE: 6 FLOPs per dimension pair (2 muls + 1 sub for real, 2 muls + 1 add for imag) + # For Q: num_tokens * num_qo_heads * (rope_dim/2) pairs * 6 FLOPs + # For K: depends on architecture + q_flops = num_tokens * num_qo_heads * (rope_dim / 2) * 6 + + if config_name == "mla": + # MLA: K is 2D (no head dimension) + k_flops = num_tokens * (rope_dim / 2) * 6 + else: + # GQA/MHA: K is 3D (has head dimension) + k_flops = num_tokens * num_kv_heads * (rope_dim / 2) * 6 + + total_flops = q_flops + k_flops + tflops = ( + total_flops / ms / 1e9 + ) # TFLOPs (operations per ms = operations per second / 1e12) + + return ms, min_ms, max_ms, bandwidth_gb_s, tflops + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--ncu-single", action="store_true", help="Run a single execute() for ncu" + ) + parser.add_argument( + "--config", type=str, default="", help="Config name: mla/gqa/mha" + ) + parser.add_argument("--num-tokens", type=int, default=0) + parser.add_argument("--page-size", type=int, default=16) + parser.add_argument("--enable-pdl", type=int, default=0) + args, unknown = parser.parse_known_args() + + if args.ncu_single: + # Minimal single-run for ncu profiling + cfg = args.config or "mla" + ntok = int(args.num_tokens) + pgsz = int(args.page_size) + en_pdl = bool(int(args.enable_pdl)) + # Force a single execution path + benchmark_config(cfg, ntok, page_size=pgsz, enable_pdl=en_pdl, single_run=True) + sys.exit(0) + + # Get GPU information (for display only) + device = torch.device("cuda:0") + gpu_name = torch.cuda.get_device_name(0) + gpu_peak_bandwidth = get_gpu_memory_bandwidth(device) + print(f"\nDetected GPU: {gpu_name}") + print(f"Theoretical Peak Memory Bandwidth: {gpu_peak_bandwidth:.2f} GB/s") + print() + + # Token counts to benchmark + token_counts = [1, 32, 128, 384, 768, 1024, 2048, 4096, 8192] + + # Helper function to print a table for a specific configuration + def print_config_table(config_name, config_desc): + page_size_to_benchmark = 32 + print(f"\n{'=' * 100}") + print(f" {config_name.upper()}: {config_desc}") + print(f"{'=' * 100}") + + print( + f"{'Tokens':<10} {'Time (ms)':<12} {'BW (GB/s)':<12} {'BW% (Peak)':<14} {'TFLOPs':<12}" + ) + print("-" * 70) + for num_tokens in token_counts: + ms, _, _, bw, tflops = benchmark_config( + config_name, num_tokens, page_size=page_size_to_benchmark + ) + bw_pct = (bw / gpu_peak_bandwidth) * 100 + print( + f"{num_tokens:<10} {ms:<12.5f} {bw:<12.2f} {bw_pct:<14.1f} {tflops:<12.3f}" + ) + + # Print tables for each configuration + print_config_table("mla", "128 Q heads, 1 K head, 64+512 dims (DeepSeek-style)") + print_config_table("gqa", "32 Q heads, 8 K heads, 64+64 dims (Llama-style)") + print_config_table("mha", "32 Q heads, 32 K heads, 64+64 dims (Standard)") + + print("\n" + "=" * 100) + print("Configuration details:") + print(" Page size: 32, Batch size: 4") + print(" Token range: 1 (single decode) → 8192 (large prefill)") + print(f" GPU: {gpu_name}") + print(f" Theoretical Peak Memory Bandwidth: {gpu_peak_bandwidth:.2f} GB/s") + print(" BW% calculated as: (achieved_bandwidth / peak_bandwidth) * 100") + print("=" * 100) diff --git a/benchmarks/bench_sampling.py b/benchmarks/bench_sampling.py index 2eb2de3875..cc2406e43f 100644 --- a/benchmarks/bench_sampling.py +++ b/benchmarks/bench_sampling.py @@ -220,6 +220,86 @@ def main(): f"vocab_size: 
{vocab_size}, batch_size: {batch_size}, distrib: {distrib.__name__}, deterministic: {deterministic}, duration: {ms * 1e3:.2f} us, effective bandwidth: {bandwidth:.2f} GB/s" ) + print("---") + print("top-p renorm probs") + for vocab_size in [128512]: + for batch_size in [1, 16, 32, 64, 128, 256, 512]: + torch.manual_seed(42) + for distrib in [ + normal_distribution(1), + normal_distribution(5), + gumbel_distribution(0.1), + gumbel_distribution(1), + ]: + for p in [0.1, 0.5, 0.9]: + logits = distrib((batch_size, vocab_size), device="cuda") + probs = torch.softmax(logits, dim=-1) + measurements = bench_gpu_time( + lambda: flashinfer.sampling.top_p_renorm_probs(probs, p), + dry_run_time_ms=100, + repeat_time_ms=1000, + ) + ms = np.median(measurements) + + io = probs.numel() * probs.element_size() * 2 + bandwidth = io * 1e-6 / ms + print( + f"vocab_size: {vocab_size}, batch_size: {batch_size}, distrib: {distrib.__name__}, p: {p}, duration: {ms * 1e3:.2f} us, effective bandwidth: {bandwidth:.2f} GB/s" + ) + + print("---") + print("top-k renorm probs") + for vocab_size in [128512]: + for batch_size in [1, 16, 32, 64, 128, 256, 512]: + torch.manual_seed(42) + for distrib in [ + normal_distribution(1), + normal_distribution(5), + gumbel_distribution(0.1), + gumbel_distribution(1), + ]: + for k in [10, 100, 1000, 5000]: + logits = distrib((batch_size, vocab_size), device="cuda") + probs = torch.softmax(logits, dim=-1) + measurements = bench_gpu_time( + lambda: flashinfer.sampling.top_k_renorm_probs(probs, k), + dry_run_time_ms=100, + repeat_time_ms=1000, + ) + ms = np.median(measurements) + + io = probs.numel() * probs.element_size() * 2 + bandwidth = io * 1e-6 / ms + print( + f"vocab_size: {vocab_size}, batch_size: {batch_size}, distrib: {distrib.__name__}, k: {k}, duration: {ms * 1e3:.2f} us, effective bandwidth: {bandwidth:.2f} GB/s" + ) + + print("---") + print("top-k mask logits") + for vocab_size in [128512]: + for batch_size in [1, 16, 32, 64, 128, 256, 512]: + torch.manual_seed(42) + for distrib in [ + normal_distribution(1), + normal_distribution(5), + gumbel_distribution(0.1), + gumbel_distribution(1), + ]: + for k in [10, 100, 1000, 5000]: + logits = distrib((batch_size, vocab_size), device="cuda") + measurements = bench_gpu_time( + lambda: flashinfer.sampling.top_k_mask_logits(logits, k), + dry_run_time_ms=100, + repeat_time_ms=1000, + ) + ms = np.median(measurements) + + io = logits.numel() * logits.element_size() * 2 + bandwidth = io * 1e-6 / ms + print( + f"vocab_size: {vocab_size}, batch_size: {batch_size}, distrib: {distrib.__name__}, k: {k}, duration: {ms * 1e3:.2f} us, effective bandwidth: {bandwidth:.2f} GB/s" + ) + if __name__ == "__main__": main() diff --git a/benchmarks/bench_softmax.py b/benchmarks/bench_softmax.py new file mode 100755 index 0000000000..6da8dc9fcb --- /dev/null +++ b/benchmarks/bench_softmax.py @@ -0,0 +1,214 @@ +#!/usr/bin/env python3 +""" +Benchmark script comparing torch.softmax vs flashinfer.softmax performance. +Creates a heatmap showing speedup across different batch sizes and hidden dimensions. 
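+
+Run directly (no CLI arguments; the batch and hidden sizes are configured in main()):
+
+    python3 benchmarks/bench_softmax.py
+
+Plotting requires matplotlib and seaborn in addition to torch and flashinfer.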
+""" + +import numpy as np +import torch +import matplotlib.pyplot as plt +import seaborn as sns +from typing import List, Tuple +import flashinfer +from flashinfer.testing.utils import bench_gpu_time + + +@torch.inference_mode() +def benchmark_torch_softmax(logits: torch.Tensor) -> float: + """Benchmark torch's native softmax.""" + measurements = bench_gpu_time( + lambda: torch.softmax(logits, dim=-1), + dry_run_time_ms=100, + repeat_time_ms=1000, + ) + return np.median(measurements) + + +@torch.inference_mode() +def benchmark_flashinfer_softmax(logits: torch.Tensor) -> float: + """Benchmark flashinfer's softmax.""" + measurements = bench_gpu_time( + lambda: flashinfer.sampling.softmax(logits, temperature=None, enable_pdl=False), + dry_run_time_ms=100, + repeat_time_ms=1000, + ) + return np.median(measurements) + + +def run_benchmark( + batch_sizes: List[int], hidden_sizes: List[int] +) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: + """ + Run benchmarks for all combinations of batch_size and hidden_size. + + Returns: + torch_times: 2D array of torch softmax times (ms) + flashinfer_times: 2D array of flashinfer softmax times (ms) + speedups: 2D array of speedup ratios (torch_time / flashinfer_time) + """ + n_batch = len(batch_sizes) + n_hidden = len(hidden_sizes) + + torch_times = np.zeros((n_batch, n_hidden)) + flashinfer_times = np.zeros((n_batch, n_hidden)) + speedups = np.zeros((n_batch, n_hidden)) + + print("Running benchmarks...") + print("=" * 100) + print( + f"{'Batch Size':<12} {'Hidden Size':<12} {'Torch (ms)':<15} " + f"{'FlashInfer (ms)':<18} {'Speedup':<10} {'Bandwidth (GB/s)':<18}" + ) + print("=" * 100) + + for i, batch_size in enumerate(batch_sizes): + for j, hidden_size in enumerate(hidden_sizes): + # Generate random logits + torch.manual_seed(42) + logits = torch.randn( + batch_size, hidden_size, device="cuda", dtype=torch.float32 + ) + + # Benchmark torch softmax + torch_time_ms = benchmark_torch_softmax(logits) + torch_times[i, j] = torch_time_ms + + # Benchmark flashinfer softmax + flashinfer_time_ms = benchmark_flashinfer_softmax(logits) + flashinfer_times[i, j] = flashinfer_time_ms + + # Calculate speedup + speedup = torch_time_ms / flashinfer_time_ms + speedups[i, j] = speedup + + # Calculate effective bandwidth (read + write) + io_bytes = logits.numel() * logits.element_size() * 2 + bandwidth_gb_s = io_bytes * 1e-6 / flashinfer_time_ms + + print( + f"{batch_size:<12} {hidden_size:<12} {torch_time_ms:<15.4f} " + f"{flashinfer_time_ms:<18.4f} {speedup:<10.2f}x {bandwidth_gb_s:<18.2f}" + ) + + print("=" * 100) + return torch_times, flashinfer_times, speedups + + +def plot_heatmap( + speedups: np.ndarray, + batch_sizes: List[int], + hidden_sizes: List[int], + save_path: str = "softmax_speedup_heatmap.png", +): + """Create and save a heatmap of speedup values.""" + # Create figure + fig, ax = plt.subplots(figsize=(12, 8)) + + # Create heatmap + sns.heatmap( + speedups, + annot=True, + fmt=".2f", + cmap="RdYlGn", + center=1.0, + cbar_kws={"label": "Speedup (x)"}, + xticklabels=[f"{h // 1000}K" for h in hidden_sizes], + yticklabels=batch_sizes, + ax=ax, + vmin=0.5, # Adjust color scale + vmax=max(3.0, speedups.max()), # Dynamic upper bound + ) + + ax.set_xlabel("Hidden Size", fontsize=12, fontweight="bold") + ax.set_ylabel("Batch Size", fontsize=12, fontweight="bold") + ax.set_title( + "FlashInfer Softmax Speedup vs PyTorch (Higher is Better)", + fontsize=14, + fontweight="bold", + pad=20, + ) + + plt.tight_layout() + plt.savefig(save_path, dpi=300, bbox_inches="tight") 
+ print(f"\nHeatmap saved to: {save_path}") + + # Also create a performance comparison plot + _, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6)) + + # Plot 2: Speedup trends across batch sizes + for j, hidden_size in enumerate(hidden_sizes): + ax2.plot( + batch_sizes, + speedups[:, j], + marker="o", + label=f"Hidden={hidden_size // 1000}K", + linewidth=2, + ) + + ax2.set_xlabel("Batch Size", fontsize=12, fontweight="bold") + ax2.set_ylabel("Speedup (x)", fontsize=12, fontweight="bold") + ax2.set_title("Speedup vs Batch Size", fontsize=13, fontweight="bold") + ax2.set_xscale("log", base=2) + ax2.grid(True, alpha=0.3) + ax2.legend(fontsize=9) + ax2.axhline(y=1.0, color="red", linestyle="--", alpha=0.5, label="No speedup") + + # Plot 1: Speedup trends across hidden sizes + for i, batch_size in enumerate(batch_sizes[::2]): # Sample every other batch size + idx = i * 2 + ax1.plot( + [h // 1000 for h in hidden_sizes], + speedups[idx, :], + marker="s", + label=f"Batch={batch_size}", + linewidth=2, + ) + + ax1.set_xlabel("Hidden Size (K)", fontsize=12, fontweight="bold") + ax1.set_ylabel("Speedup (x)", fontsize=12, fontweight="bold") + ax1.set_title("Speedup vs Hidden Size", fontsize=13, fontweight="bold") + ax1.grid(True, alpha=0.3) + ax1.legend(fontsize=9) + ax1.axhline(y=1.0, color="red", linestyle="--", alpha=0.5) + + plt.tight_layout() + comparison_path = save_path.replace(".png", "_trends.png") + plt.savefig(comparison_path, dpi=300, bbox_inches="tight") + print(f"Trend plots saved to: {comparison_path}") + + +def main(): + """Main benchmark execution.""" + # Configuration + batch_sizes = [1, 4, 8, 16, 32, 64, 128, 256, 512, 1024] + hidden_sizes = [32000, 64000, 128000, 256000] + + print("=" * 100) + print("FlashInfer vs PyTorch Softmax Benchmark") + print("=" * 100) + print(f"Batch sizes: {batch_sizes}") + print(f"Hidden sizes: {hidden_sizes}") + print(f"Device: {torch.cuda.get_device_name()}") + print("=" * 100) + print() + + # Run benchmarks + _, _, speedups = run_benchmark(batch_sizes, hidden_sizes) + + # Print summary statistics + print("\nSummary Statistics:") + print("=" * 100) + print(f"Average speedup: {np.mean(speedups):.2f}x") + print(f"Median speedup: {np.median(speedups):.2f}x") + print(f"Min speedup: {np.min(speedups):.2f}x") + print(f"Max speedup: {np.max(speedups):.2f}x") + print("=" * 100) + + # Generate heatmap + plot_heatmap(speedups, batch_sizes, hidden_sizes) + + print("\nBenchmark complete!") + + +if __name__ == "__main__": + main() diff --git a/benchmarks/bench_trtllm_gen_fused_moe_autotuner.py b/benchmarks/bench_trtllm_gen_fused_moe_autotuner.py index 952b479a1d..203faaff82 100644 --- a/benchmarks/bench_trtllm_gen_fused_moe_autotuner.py +++ b/benchmarks/bench_trtllm_gen_fused_moe_autotuner.py @@ -8,13 +8,164 @@ fp4_quantize, mxfp8_quantize, ) -from flashinfer.fused_moe import trtllm_fp4_block_scale_moe +from flashinfer.fused_moe import ( + trtllm_fp4_block_scale_moe, + trtllm_fp8_per_tensor_scale_moe, + trtllm_fp8_block_scale_moe, + WeightLayout, +) from flashinfer.autotuner import autotune from flashinfer.testing.utils import bench_gpu_time -from flashinfer.utils import device_support_pdl, calculate_tile_tokens_dim +from flashinfer.utils import device_support_pdl + +FLOAT8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max +FLOAT4_E2M1_MAX = 6.0 + + +def fp8_quantize(x): + max = x.abs().max().float() + scale = FLOAT8_E4M3_MAX / max + x = (x * scale).to(torch.float8_e4m3fn) + return x, 1.0 / scale + + +def bench_trtllm_gen_fused_moe_autotuner_fp8( + tune_max_num_tokens: 
Optional[int], + quant_mode: Literal["Fp8-Per-Tensor", "Fp8-Block"], + num_tokens: int, + num_experts: int, + hidden_size: int, + intermediate_size: int, + top_k: int, + warmups: int, + iterations: int, +): + device = torch.device("cuda:0") + enable_pdl = device_support_pdl(device) + routing_logits = torch.rand(num_tokens, num_experts, device=device).to( + torch.float32 + ) + hidden_states = torch.randn(num_tokens, hidden_size, device=device).to( + torch.bfloat16 + ) + routing_bias = torch.randn(num_experts, device="cuda", dtype=torch.bfloat16) + w13 = torch.randn( + num_experts, intermediate_size * 2, hidden_size, device=device + ).to(torch.bfloat16) + w2 = torch.randn(num_experts, hidden_size, intermediate_size, device=device).to( + torch.bfloat16 + ) + + is_block_scale = quant_mode == "Fp8-Block" + if not is_block_scale: + hidden_states, hidden_states_scale = fp8_quantize(hidden_states) + w13, w13_scale = fp8_quantize(w13) + w2, w2_scale = fp8_quantize(w2) + else: + # block scale quantization is too slow, so we use per-tensor quantization for now + hidden_states, hidden_states_scale = fp8_quantize(hidden_states) + w13, w13_scale = fp8_quantize(w13) + w2, w2_scale = fp8_quantize(w2) + hidden_states_scale = torch.full( + (hidden_size // 128, num_tokens), hidden_states_scale.item(), device=device + ) + w13_scale = torch.full( + (num_experts, intermediate_size * 2 // 128, hidden_size // 128), + w13_scale.item(), + device=device, + ) + w2_scale = torch.full( + (num_experts, hidden_size // 128, intermediate_size // 128), + w2_scale.item(), + device=device, + ) + + output1_scale_scalar = ( + torch.tensor([hidden_states_scale * w13_scale] * num_experts, device=device) + if not is_block_scale + else None + ) + output1_scales_gate_scalar = ( + torch.ones(num_experts, device=device, dtype=torch.float32) + if not is_block_scale + else None + ) + output2_scale_scalar = ( + torch.tensor([hidden_states_scale * w2_scale] * num_experts, device=device) + if not is_block_scale + else None + ) + + if is_block_scale: + fn = lambda: trtllm_fp8_block_scale_moe( + routing_logits, + routing_bias, + hidden_states, + hidden_states_scale, + w13, + w13_scale, + w2, + w2_scale, + num_experts, + top_k, + 8, # n_group + 4, # topk_group + intermediate_size, + 0, # local_expert_offset + num_experts, + 2.5, # routed_scaling_factor + RoutingMethodType.DeepSeekV3.value, + True, # use_shuffled_weight + WeightLayout.BlockMajorK.value, # weight_layout + enable_pdl=enable_pdl, + tune_max_num_tokens=num_tokens + if tune_max_num_tokens is None + else tune_max_num_tokens, + ) + else: + fn = lambda: trtllm_fp8_per_tensor_scale_moe( + routing_logits, + None, # routing_bias + hidden_states, + w13, + output1_scale_scalar, + output1_scales_gate_scalar, + w2, + output2_scale_scalar, + num_experts, + top_k, + None, # n_group + None, # topk_group + intermediate_size, + 0, # local_expert_offset + num_experts, + 1.0, # routed_scaling_factor + False, # use_routing_scales_on_input + RoutingMethodType.TopK.value, + enable_pdl, + num_tokens if tune_max_num_tokens is None else tune_max_num_tokens, + ) + + def bench(do_autotune): + with autotune(do_autotune): + fn() + ms_list = bench_gpu_time( + fn, + dry_run_iters=warmups, + repeat_iters=iterations, + ) + median_ms = np.median(ms_list) + return median_ms + ms = bench(do_autotune=False) + ms_tuned = bench(do_autotune=True) + print( + f"num tokens: {num_tokens}, num experts: {num_experts}, hidden size: {hidden_size}, intermediate size: {intermediate_size}, top k: {top_k}" + ) + print(f"No 
autotune: {ms:.3f} ms; with autotune: {ms_tuned:.3f} ms") -def bench_trtllm_gen_fused_moe_autotuner( + +def bench_trtllm_gen_fused_moe_autotuner_fp4( tune_max_num_tokens: Optional[int], quant_mode: Literal["NvFP4xNvFP4", "MxFP4xMxFP8", "MxFP4xBf16"], num_tokens: int, @@ -39,6 +190,7 @@ def bench_trtllm_gen_fused_moe_autotuner( torch.tensor([448.0 * 6.0], device=device), sf_vec_size=16, sf_use_ue8m0=False, + is_sf_swizzled_layout=False, ) hidden_states_scale = hidden_states_scale.view(torch.float8_e4m3fn).reshape( num_tokens, -1 @@ -99,9 +251,6 @@ def bench_trtllm_gen_fused_moe_autotuner( bias13 = torch.randn(num_experts, intermediate_size * 2, device=device) * 10 bias2 = torch.randn(num_experts, intermediate_size * 2, device=device) * 10 - tile_tokens_dim = calculate_tile_tokens_dim( - num_tokens, num_experts, top_k, 64 if quant_mode == "MxFP4xBf16" else 128 - ) output1_scale_scalar = torch.tensor( [hidden_states_global_scale * w13_global_scale] * num_experts, device=device ) @@ -136,7 +285,6 @@ def bench_trtllm_gen_fused_moe_autotuner( 0, # local_expert_offset num_experts, None, # routed_scaling_factor - tile_tokens_dim, RoutingMethodType.Renormalize.value, True, enable_pdl, @@ -146,12 +294,11 @@ def bench_trtllm_gen_fused_moe_autotuner( ) def bench(do_autotune): - # warmup with autotune(do_autotune): - for _ in range(warmups): - fn() + fn() ms_list = bench_gpu_time( fn, + dry_run_iters=warmups, repeat_iters=iterations, ) median_ms = np.median(ms_list) @@ -171,7 +318,13 @@ def bench(do_autotune): "--quant-mode", type=str, default="MxFP4xMxFP8", - choices=["NvFP4xNvFP4", "MxFP4xMxFP8", "MxFP4xBf16"], + choices=[ + "NvFP4xNvFP4", + "MxFP4xMxFP8", + "MxFP4xBf16", + "Fp8-Per-Tensor", + "Fp8-Block", + ], help="Quantization mode", ) parser.add_argument("--num-tokens", type=int, default=512, help="Number of tokens") @@ -196,14 +349,27 @@ def bench(do_autotune): "--iterations", type=int, default=100, help="Number of benchmark iterations" ) args = parser.parse_args() - bench_trtllm_gen_fused_moe_autotuner( - args.tune_max_num_tokens, - args.quant_mode, - args.num_tokens, - args.num_experts, - args.hidden_size, - args.intermediate_size, - args.top_k, - args.warmups, - args.iterations, - ) + if args.quant_mode in ["Fp8-Per-Tensor", "Fp8-Block"]: + bench_trtllm_gen_fused_moe_autotuner_fp8( + args.tune_max_num_tokens, + args.quant_mode, + args.num_tokens, + args.num_experts, + args.hidden_size, + args.intermediate_size, + args.top_k, + args.warmups, + args.iterations, + ) + else: + bench_trtllm_gen_fused_moe_autotuner_fp4( + args.tune_max_num_tokens, + args.quant_mode, + args.num_tokens, + args.num_experts, + args.hidden_size, + args.intermediate_size, + args.top_k, + args.warmups, + args.iterations, + ) diff --git a/benchmarks/flashinfer_benchmark.py b/benchmarks/flashinfer_benchmark.py index bd02172eb2..330d734221 100644 --- a/benchmarks/flashinfer_benchmark.py +++ b/benchmarks/flashinfer_benchmark.py @@ -79,7 +79,13 @@ def parse_args(line=sys.argv[1:]): "--use_cupti", action="store_true", default=False, - help="Use CUPTI for timing GPU kernels when available.", + help="[DEPRECATED] Use CUPTI for timing GPU kernels. 
This is now the default behavior.", + ) + parser.add_argument( + "--use_cuda_events", + action="store_true", + default=False, + help="Use CUDA events for timing GPU kernels instead of CUPTI.", ) parser.add_argument( "--refcheck", @@ -155,6 +161,16 @@ def parse_args(line=sys.argv[1:]): if args.generate_repro_command: args.repro_command = "python3 flashinfer_benchmark.py " + " ".join(line) + + # Deprecation warning for use_cupti + if args.use_cupti: + print( + "[WARNING] --use_cupti is deprecated and will be removed in a future release. CUPTI is now enabled by default." + ) + # use_cupti is deprecated and will be removed in a future release. CUPTI is now enabled by default. + # If --use_cuda_events is passed, disable use_cupti + args.use_cupti = not args.use_cuda_events + return args diff --git a/benchmarks/routines/attention.py b/benchmarks/routines/attention.py index bfebc37d4d..320cfbe020 100644 --- a/benchmarks/routines/attention.py +++ b/benchmarks/routines/attention.py @@ -19,6 +19,30 @@ ) +def normalize_backends(backends): + """ + Normalize backend names planned for deprecation and print warnings. + Currently: + - Replaces deprecated 'trtllm-gen-native' with 'trtllm-native'. + + Args: + backends: List of backend names + + Returns: + List of normalized backend names + """ + normalized = [] + for backend in backends: + if backend == "trtllm-gen-native": + print( + "[WARNING] Backend name 'trtllm-gen-native' has been renamed to 'trtllm-native' and will be removed in a future release. " + ) + normalized.append("trtllm-native") + else: + normalized.append(backend) + return normalized + + def run_attention_test(args): """ Run an attention test. @@ -66,7 +90,8 @@ def parse_attention_args(line, parser): "cudnn", "cutlass", "trtllm-gen", - "trtllm-gen-native", + "trtllm-native", + "trtllm-gen-native", # Deprecated, will be removed in future ], help="Kernel backends to test. Default: fa2", ) @@ -151,6 +176,10 @@ def parse_attention_args(line, parser): ) args = parser.parse_args(line) + + # Normalize backend names (handle deprecated names) + args.backends = normalize_backends(args.backends) + if args.verbose >= 1: print(f"[INFO] {args = }") return args @@ -185,7 +214,7 @@ def sample_actual_seq_lens(max_seqlen, batch_size, device, random_actual_seq_len def testBatchDecodeWithPagedKVCacheWrapper(args): """ Test BatchDecodeWithPagedKVCacheWrapper API and equivalent cuDNN API. - Supports fa2, fa2_tc, cudnn, trtllm-gen, trtllm-gen-native backends. + Supports fa2, fa2_tc, cudnn, trtllm-gen, trtllm-native backends. This test: 1. 
Creates paged KV cache and query tensors @@ -367,7 +396,7 @@ def testBatchDecodeWithPagedKVCacheWrapper(args): # Now initialize the page tables block_tables = torch.tensor( [ - [k + i * num_pages_per_seq for k in range(num_pages_per_seq)] + [k + i * num_pages_per_seq for k in torch.randperm(num_pages_per_seq)] for i in range(batch_size) ], dtype=torch.int, @@ -392,11 +421,7 @@ def testBatchDecodeWithPagedKVCacheWrapper(args): for i in range(len(kv_indptr) - 1): start_idx = kv_indptr[i] end_idx = kv_indptr[i + 1] - kv_indices[start_idx:end_idx] = torch.arange( - i * num_pages_per_seq, - i * num_pages_per_seq + (end_idx - start_idx), - device=device, - ) + kv_indices[start_idx:end_idx] = block_tables[i, : end_idx - start_idx] kv_last_page_len = ( torch.where( @@ -490,7 +515,7 @@ def run_backend_wrapper(backend): batch_offsets_q=ragged_q, batch_offsets_o=ragged_q, ) - elif backend == "trtllm-gen-native": + elif backend == "trtllm-native": return flashinfer.decode.trtllm_batch_decode_with_kv_cache( query=q.contiguous(), kv_cache=kv_cache, @@ -508,6 +533,8 @@ def run_backend_wrapper(backend): has_reference_output = False # Iterate over each backend: for cur_backend in backends: + # Clear workspace buffer to prevent unexpected interactions between backends. + workspace_buffer.zero_() if run_refcheck: outputs[cur_backend] = run_backend_wrapper(cur_backend).detach().clone() if cur_backend == "fa2": @@ -612,7 +639,7 @@ def run_backend_wrapper(backend): def testBatchPrefillWithPagedKVCacheWrapper(args): """ Test BatchPrefillWithPagedKVCacheWrapper API and equivalent cuDNN API. - Supports fa2, fa3, trtllm-gen, trtllm-gen-native, and cudnn backends. + Supports fa2, fa3, trtllm-gen, trtllm-native, and cudnn backends. This test: 1. Creates paged KV cache and query tensors for prefill @@ -695,13 +722,13 @@ def testBatchPrefillWithPagedKVCacheWrapper(args): remove_trtllm = True if remove_trtllm: backends.remove("trtllm-gen") - if "trtllm-gen-native" in backends: + if "trtllm-native" in backends: remove_trtllm_native = False if not causal: - print("[INFO] trtllm-gen-native backend currently requires causal = True") + print("[INFO] trtllm-native backend currently requires causal = True") remove_trtllm_native = True if remove_trtllm_native: - backends.remove("trtllm-gen-native") + backends.remove("trtllm-native") if "cutlass" in backends: print("[INFO] CUTLASS backend does not support prefill. 
Skipping.") @@ -806,7 +833,7 @@ def testBatchPrefillWithPagedKVCacheWrapper(args): # Now initialize the page tables block_tables = torch.tensor( [ - [k + i * num_pages_per_seq for k in range(num_pages_per_seq)] + [k + i * num_pages_per_seq for k in torch.randperm(num_pages_per_seq)] for i in range(batch_size) ], dtype=torch.int, @@ -856,11 +883,7 @@ def testBatchPrefillWithPagedKVCacheWrapper(args): for i in range(len(kv_indptr) - 1): start_idx = kv_indptr[i] end_idx = kv_indptr[i + 1] - kv_indices[start_idx:end_idx] = torch.arange( - i * num_pages_per_seq, - i * num_pages_per_seq + (end_idx - start_idx), - device=device, - ) + kv_indices[start_idx:end_idx] = block_tables[i, : end_idx - start_idx] kv_last_page_len = ( torch.where( actual_seq_lens_kv_device.flatten() % page_size == 0, @@ -953,7 +976,7 @@ def run_backend_wrapper(backend): batch_offsets_q=q_indptr, batch_offsets_o=q_indptr, )[0] - elif backend == "trtllm-gen-native": + elif backend == "trtllm-native": return flashinfer.prefill.trtllm_batch_context_with_kv_cache( query=q, kv_cache=kv_cache, @@ -975,6 +998,8 @@ def run_backend_wrapper(backend): has_reference_output = False # Iterate over each backend: for cur_backend in backends: + # Clear workspace buffer to prevent unexpected interactions between backends. + workspace_buffer.zero_() if run_refcheck: outputs[cur_backend] = run_backend_wrapper(cur_backend).detach().clone() if cur_backend == "fa2": @@ -1174,21 +1199,21 @@ def testBatchPrefillWithRaggedKVCacheWrapper(args): remove_trtllm = True if remove_trtllm: backends.remove("trtllm-gen") - if "trtllm-gen-native" in backends: + if "trtllm-native" in backends: remove_trtllm_native = False if q_dtype in [torch.float8_e4m3fn, torch.float8_e5m2] or kv_dtype in [ torch.float8_e4m3fn, torch.float8_e5m2, ]: - print("[INFO] trtllm-gen-native backend does not support FP8. Skipping.") + print("[INFO] trtllm-native backend does not support FP8. Skipping.") remove_trtllm_native = True if not (head_dim_qk == 192 and head_dim_vo == 128): print( - "[INFO] trtllm-gen-native backend requires head_dim_qk == 192 and head_dim_vo == 128" + "[INFO] trtllm-native backend requires head_dim_qk == 192 and head_dim_vo == 128" ) remove_trtllm_native = True if remove_trtllm_native: - backends.remove("trtllm-gen-native") + backends.remove("trtllm-native") if len(backends) == 0: print("[ERROR] No backends to test. Exiting.") @@ -1400,7 +1425,7 @@ def run_backend_wrapper(backend): batch_offsets_stats=batch_offsets_stats, is_cuda_graph_compatible=True, )[0] - elif backend == "trtllm-gen-native": + elif backend == "trtllm-native": return flashinfer.prefill.trtllm_ragged_attention_deepseek( query=q, key=k, @@ -1427,6 +1452,8 @@ def run_backend_wrapper(backend): has_reference_output = False # Iterate over each backend: for cur_backend in backends: + # Clear workspace buffer to prevent unexpected interactions between backends. + workspace_buffer.zero_() if run_refcheck: outputs[cur_backend] = run_backend_wrapper(cur_backend).detach().clone() if cur_backend == "fa2": @@ -1532,7 +1559,7 @@ def run_backend_wrapper(backend): def testBatchMLAPagedAttentionWrapper(args): """ Test BatchMLAPagedAttentionWrapper and equivalent APIs. - Supports fa2, fa3, cutlass, and trtllm-gen-native. + Supports fa2, fa3, cutlass, and trtllm-native. This test: 1. 
Creates paged query and key-value cache tensors @@ -1628,15 +1655,15 @@ def testBatchMLAPagedAttentionWrapper(args): remove_cutlass = True if remove_cutlass: backends.remove("cutlass") - if "trtllm-gen-native" in backends: + if "trtllm-native" in backends: remove_trtllm_native = False if page_size not in [32, 64]: print( - "[INFO] trtllm-gen-native backend only supports page size 32 or 64. Skipping." + "[INFO] trtllm-native backend only supports page size 32 or 64. Skipping." ) remove_trtllm_native = True if remove_trtllm_native: - backends.remove("trtllm-gen-native") + backends.remove("trtllm-native") if len(backends) == 0: print("[ERROR] No backends to test. Exiting.") return res @@ -1676,7 +1703,7 @@ def testBatchMLAPagedAttentionWrapper(args): # Now initialize the page tables block_tables = torch.tensor( [ - [k + i * num_pages_per_seq for k in range(num_pages_per_seq)] + [k + i * num_pages_per_seq for k in torch.randperm(num_pages_per_seq)] for i in range(batch_size) ], dtype=torch.int, @@ -1723,11 +1750,7 @@ def testBatchMLAPagedAttentionWrapper(args): for i in range(len(kv_indptr) - 1): start_idx = kv_indptr[i] end_idx = kv_indptr[i + 1] - kv_indices[start_idx:end_idx] = torch.arange( - i * num_pages_per_seq, - i * num_pages_per_seq + (end_idx - start_idx), - device=device, - ) + kv_indices[start_idx:end_idx] = block_tables[i, : end_idx - start_idx] sm_scale = 1.0 / ((128 + 64) ** 0.5) # For DeepSeek-R1 workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.int8, device=device) @@ -1801,8 +1824,8 @@ def run_backend_wrapper(backend): page_table=block_tables, return_lse=False, ) - if backend == "trtllm-gen-native": - return flashinfer.decode.trtllm_batch_decode_with_kv_cache_mla( + elif backend == "trtllm-native": + return flashinfer.mla.trtllm_batch_decode_with_kv_cache_mla( query=q.unsqueeze(1), kv_cache=kv_cache.unsqueeze(1), workspace_buffer=workspace_buffer, @@ -1822,6 +1845,8 @@ def run_backend_wrapper(backend): has_reference_output = False # Iterate over each backend: for cur_backend in backends: + # Clear workspace buffer to prevent unexpected interactions between backends. 
+ workspace_buffer.zero_() if run_refcheck: outputs[cur_backend] = run_backend_wrapper(cur_backend).detach().clone() if cur_backend == "fa2": diff --git a/benchmarks/routines/flashinfer_benchmark_utils.py b/benchmarks/routines/flashinfer_benchmark_utils.py index fa1a527d17..d5f363839a 100644 --- a/benchmarks/routines/flashinfer_benchmark_utils.py +++ b/benchmarks/routines/flashinfer_benchmark_utils.py @@ -53,7 +53,6 @@ "routed_scaling_factor", "local_expert_offset", "local_num_experts", - "tile_tokens_dim", "routing_method", "use_shuffled_weight", "weight_layout", @@ -162,43 +161,47 @@ def dtype_str_to_torch_dtype(dtype_str): routine_cc_to_supported_backends = { # ATTENTION "BatchDecodeWithPagedKVCacheWrapper": { + # NOTE: trtllm-native calls trtllm_batch_decode_with_kv_cache "7.5": ["fa2"], "8.0": ["fa2", "fa2_tc", "cudnn"], "8.6": ["fa2", "fa2_tc", "cudnn"], "8.9": ["fa2", "fa2_tc", "cudnn"], - "9.0": ["fa2", "fa2_tc", "cudnn"], - "10.0": ["fa2", "fa2_tc", "cudnn", "trtllm-gen", "trtllm-gen-native"], - "10.3": ["fa2", "fa2_tc", "cudnn", "trtllm-gen", "trtllm-gen-native"], - "12.0": ["fa2", "fa2_tc", "cudnn"], + "9.0": ["fa2", "fa2_tc", "cudnn", "trtllm-native"], + "10.0": ["fa2", "fa2_tc", "cudnn", "trtllm-gen", "trtllm-native"], + "10.3": ["fa2", "fa2_tc", "cudnn", "trtllm-gen", "trtllm-native"], + "12.0": ["fa2", "fa2_tc", "cudnn", "trtllm-native"], }, "BatchPrefillWithPagedKVCacheWrapper": { + # NOTE: trtllm-native calls trtllm_batch_context_with_kv_cache "7.5": [], "8.0": ["fa2", "cudnn"], "8.6": ["fa2", "cudnn"], "8.9": ["fa2", "cudnn"], "9.0": ["fa2", "fa3", "cudnn"], - "10.0": ["fa2", "cudnn", "trtllm-gen", "trtllm-gen-native"], - "10.3": ["fa2", "cudnn", "trtllm-gen", "trtllm-gen-native"], + "10.0": ["fa2", "cudnn", "trtllm-gen", "trtllm-native"], + "10.3": ["fa2", "cudnn", "trtllm-gen", "trtllm-native"], "12.0": ["fa2", "cudnn"], }, "BatchPrefillWithRaggedKVCacheWrapper": { + # NOTE: trtllm-native calls trtllm_ragged_attention_deepseek "7.5": [], "8.0": ["fa2", "cudnn"], "8.6": ["fa2", "cudnn"], "8.9": ["fa2", "cudnn"], "9.0": ["fa2", "fa3", "cudnn"], - "10.0": ["fa2", "cudnn", "cutlass", "trtllm-gen-native"], - "10.3": ["fa2", "cudnn", "cutlass", "trtllm-gen-native"], + "10.0": ["fa2", "cudnn", "cutlass", "trtllm-native"], + "10.3": ["fa2", "cudnn", "cutlass", "trtllm-native"], "12.0": ["fa2", "cudnn"], }, "BatchMLAPagedAttentionWrapper": { + # NOTE: trtllm-native calls trtllm_batch_decode_with_kv_cache_mla "7.5": [], "8.0": ["fa2"], "8.6": ["fa2"], "8.9": ["fa2"], "9.0": ["fa2", "fa3"], - "10.0": ["fa2", "cutlass", "trtllm-gen-native"], - "10.3": ["fa2", "cutlass", "trtllm-gen-native"], + "10.0": ["fa2", "cutlass", "trtllm-native"], + "10.3": ["fa2", "cutlass", "trtllm-native"], "12.0": ["fa2"], }, # GEMM @@ -232,16 +235,7 @@ def dtype_str_to_torch_dtype(dtype_str): "10.3": ["cudnn", "cublas", "cutlass"], "12.0": ["cudnn", "cublas"], }, - "mm_fp4": { - "7.5": [], - "8.0": [], - "8.6": [], - "8.9": [], - "9.0": [], - "10.0": ["cudnn", "trtllm", "cutlass"], - "10.3": ["cudnn", "trtllm", "cutlass"], - "12.0": ["cudnn", "cutlass"], - }, + # Note: mm_fp4 uses support checkers to filter backends, so it is not listed here # MOE "trtllm_fp4_block_scale_moe": { "7.5": [], diff --git a/benchmarks/routines/gemm.py b/benchmarks/routines/gemm.py index 17336189d0..9f95f17fb4 100644 --- a/benchmarks/routines/gemm.py +++ b/benchmarks/routines/gemm.py @@ -131,7 +131,7 @@ def parse_gemm_args(line, parser): required=False, nargs="+", default=["cudnn"], - choices=["cudnn", "cublas", "trtllm", 
"cutlass"], + choices=["cudnn", "cublas", "trtllm", "cutlass", "auto"], help="Kernel backends to test. Default: cudnn", ) parser.add_argument( @@ -790,61 +790,14 @@ def testMmFp4(args): run_refcheck = args.refcheck use_128x4_sf_layout = args.use_128x4_sf_layout use_nvfp4 = args.use_nvfp4 - autotune_supported_backends = ["cutlass", "trtllm"] + autotune_supported_backends = ["cudnn", "cutlass", "trtllm", "auto"] res = [] - backends = filter_backends_by_compute_capability(backends, args.routine, device) - res_dtype = dtype_str_to_torch_dtype(args.out_dtype) if res_dtype not in [torch.bfloat16, torch.float16]: raise ValueError( f"Unsupported res dtype: {res_dtype}. Supported dtypes are bfloat16 and float16." ) - ## Done parsing input arguments - - if "trtllm" in backends: - remove_trtllm = False - if res_dtype == torch.float16: - print("[INFO] trtllm backend does not support float16 output") - remove_trtllm = True - if remove_trtllm: - backends.remove("trtllm") - if not use_nvfp4: - print( - "[INFO] trtllm backend does not support mxfp4 quantization (use_nvfp4=False)" - ) - backends.remove("trtllm") - if "cutlass" in backends: - remove_cutlass = False - if not use_128x4_sf_layout: - print("[INFO] cutlass backend does not support use_128x4_sf_layout=False") - remove_cutlass = True - if not use_nvfp4: - print( - "[INFO] cutlass backend does not support mxfp4 quantization (use_nvfp4=False)" - ) - backends.remove("cutlass") - if remove_cutlass: - backends.remove("cutlass") - if "cudnn" in backends: - remove_cudnn = False - if not use_128x4_sf_layout: - print("[INFO] cudnn backend does not support use_128x4_sf_layout=False") - remove_cudnn = True - if remove_cudnn: - backends.remove("cudnn") - if getattr(args, "autotune", False): - backends_to_remove = [] - for cur_backend in backends: - if cur_backend not in autotune_supported_backends: - print(f"[INFO] {cur_backend} backend does not support autotune") - backends_to_remove.append(cur_backend) - for cur_backend in backends_to_remove: - backends.remove(cur_backend) - - if len(backends) == 0: - print("[ERROR] No backends to test. Exiting.") - return input = torch.randn([m, k], device=device, dtype=torch.bfloat16) mat2 = torch.randn([n, k], device=device, dtype=torch.bfloat16) @@ -886,11 +839,22 @@ def testMmFp4(args): print(f"[VVERBOSE] {mat2_fp4.dtype = }") alpha = 1.0 / (global_sf_input * global_sf_mat2) if use_nvfp4 else None - # res = torch.empty([m, n], device="cuda", dtype=res_dtype) + # Completed preparing inputs. 
Now programmatically filter backends + block_size = 16 if use_nvfp4 else 32 + backends_to_remove = [] - def run_backend(backend): - if backend in ["cudnn", "trtllm", "cutlass"]: - return flashinfer.gemm.mm_fp4( + for backend in backends: + # Skip autotune check for now (handled separately below) + if ( + getattr(args, "autotune", False) + and backend not in autotune_supported_backends + ): + print(f"[INFO] {backend} backend does not support autotune") + backends_to_remove.append(backend) + continue + + try: + flashinfer.gemm.mm_fp4( a=input_fp4, b=mat2_fp4.T if backend != "trtllm" else mat2_fp4_trtllm.T, a_descale=input_inv_s, @@ -904,6 +868,34 @@ def run_backend(backend): backend=backend, use_nvfp4=use_nvfp4, ) + except Exception as e: + print( + f"[INFO] {backend} backend does not support this configuration: {type(e).__name__}: {e}" + ) + backends_to_remove.append(backend) + + # Remove unsupported backends + for backend in backends_to_remove: + backends.remove(backend) + + if len(backends) == 0: + print("[ERROR] No backends passed validation. Exiting.") + return + + def run_backend(backend): + if backend in ["cudnn", "trtllm", "cutlass", "auto"]: + return flashinfer.gemm.mm_fp4( + a=input_fp4, + b=mat2_fp4.T if backend != "trtllm" else mat2_fp4_trtllm.T, + a_descale=input_inv_s, + b_descale=mat2_inv_s.T if backend != "trtllm" else mat2_inv_s_trtllm.T, + alpha=alpha, + out_dtype=res_dtype, + block_size=block_size, + use_8x4_sf_layout=not use_128x4_sf_layout, + backend=backend, + use_nvfp4=use_nvfp4, + ) else: raise ValueError(f"Unsupported backend: {backend}") @@ -917,12 +909,11 @@ def run_backend(backend): args.dry_run_iters if args.dry_run_iters and args.dry_run_iters > 0 else 10 ) for cur_backend in backends: - if cur_backend in autotune_supported_backends: - if args.verbose >= 1: - print(f"[INFO] Autotune warmup for mm_fp4: {warmup_iters} iters") - with autotune(True): - for _ in range(warmup_iters): - run_backend(cur_backend) + if args.verbose >= 1: + print(f"[INFO] Autotune warmup for mm_fp4: {warmup_iters} iters") + with autotune(True): + for _ in range(warmup_iters): + run_backend(cur_backend) # Storage for timing results and outputs backend_times = {backend: [] for backend in backends} diff --git a/benchmarks/routines/moe.py b/benchmarks/routines/moe.py index 6af3425c73..8f26bdb8f7 100644 --- a/benchmarks/routines/moe.py +++ b/benchmarks/routines/moe.py @@ -116,13 +116,6 @@ def parse_moe_args(line, parser): default=None, help="Number of experts handled by this device. 
Defaults to num_experts.", ) - parser.add_argument( - "--tile_tokens_dim", - type=int, - required=False, - default=8, - help="Tile dimension for tokens.", - ) parser.add_argument( "--routing_method", type=str, @@ -560,7 +553,6 @@ def testTrtllmFp4BlockScaleMoe(args): ) local_expert_offset = args.local_expert_offset local_num_experts = args.local_num_experts or num_experts - tile_tokens_dim = args.tile_tokens_dim routing_method_type = args.routing_method_type use_shuffled_weight = args.use_shuffled_weight weight_layout = args.weight_layout @@ -705,7 +697,6 @@ def run_fp4_moe(): local_expert_offset=local_expert_offset, local_num_experts=local_num_experts, routed_scaling_factor=routed_scaling_factor, - tile_tokens_dim=tile_tokens_dim, routing_method_type=routing_method_type, gated_act_type=gated_act_type, do_finalize=True, @@ -780,7 +771,6 @@ def run_fp4_moe(): cur_res["routed_scaling_factor"] = routed_scaling_factor cur_res["local_expert_offset"] = local_expert_offset cur_res["local_num_experts"] = local_num_experts - cur_res["tile_tokens_dim"] = tile_tokens_dim cur_res["routing_method"] = args.routing_method cur_res["use_shuffled_weight"] = use_shuffled_weight cur_res["weight_layout"] = weight_layout @@ -1185,7 +1175,6 @@ def testTrtllmFp8BlockScaleMoe(args): ) local_expert_offset = args.local_expert_offset local_num_experts = args.local_num_experts or num_experts - tile_tokens_dim = args.tile_tokens_dim routing_method_type = args.routing_method_type use_shuffled_weight = args.use_shuffled_weight weight_layout = args.weight_layout @@ -1277,27 +1266,6 @@ def testTrtllmFp8BlockScaleMoe(args): print(f"[VVERBOSE] gemm1_weights_fp8.shape = {gemm1_weights_fp8.shape}") print(f"[VVERBOSE] gemm2_weights_fp8.shape = {gemm2_weights_fp8.shape}") - # Match test heuristic for tile_tokens_dim when using BlockMajorK - if use_shuffled_weight and weight_layout == WeightLayout.BlockMajorK: - - def _next_pow2(x: int) -> int: - x = max(1, x) - x -= 1 - x |= x >> 1 - x |= x >> 2 - x |= x >> 4 - x |= x >> 8 - x |= x >> 16 - return x + 1 - - tokens_per_expert = max(1, (num_tokens * top_k) // max(local_num_experts, 1)) - suggested_tile = min(max(_next_pow2(tokens_per_expert), 8), 64) - if suggested_tile != tile_tokens_dim and args.verbose >= 1: - print( - f"[INFO] Overriding tile_tokens_dim {tile_tokens_dim} -> {suggested_tile} for BlockMajorK" - ) - tile_tokens_dim = suggested_tile - def run_fp8_block_moe(): # Quantize hidden states to FP8 for block scale MOE hidden_states_fp8 = hidden_states.to(torch.float8_e4m3fn) @@ -1320,7 +1288,6 @@ def run_fp8_block_moe(): local_expert_offset=local_expert_offset, local_num_experts=local_num_experts, routed_scaling_factor=routed_scaling_factor, - tile_tokens_dim=tile_tokens_dim, routing_method_type=routing_method_type, use_shuffled_weight=use_shuffled_weight, weight_layout=weight_layout, @@ -1381,7 +1348,6 @@ def run_fp8_block_moe(): cur_res["routed_scaling_factor"] = routed_scaling_factor cur_res["local_expert_offset"] = local_expert_offset cur_res["local_num_experts"] = local_num_experts - cur_res["tile_tokens_dim"] = tile_tokens_dim cur_res["routing_method"] = args.routing_method cur_res["use_shuffled_weight"] = use_shuffled_weight cur_res["weight_layout"] = weight_layout @@ -1448,7 +1414,6 @@ def testTrtllmFp8PerTensorScaleMoe(args): ) local_expert_offset = args.local_expert_offset local_num_experts = args.local_num_experts or num_experts - tile_tokens_dim = args.tile_tokens_dim routing_method_type = args.routing_method_type use_routing_scales_on_input = 
args.use_routing_scales_on_input is_cuda_graph_compatible = not args.no_cuda_graph @@ -1527,7 +1492,6 @@ def run_fp8_per_tensor_moe(): local_num_experts=local_num_experts, routed_scaling_factor=routed_scaling_factor, use_routing_scales_on_input=use_routing_scales_on_input, - tile_tokens_dim=tile_tokens_dim, routing_method_type=routing_method_type, ) @@ -1585,7 +1549,6 @@ def run_fp8_per_tensor_moe(): cur_res["routed_scaling_factor"] = routed_scaling_factor cur_res["local_expert_offset"] = local_expert_offset cur_res["local_num_experts"] = local_num_experts - cur_res["tile_tokens_dim"] = tile_tokens_dim cur_res["routing_method"] = args.routing_method cur_res["use_routing_bias"] = args.use_routing_bias cur_res["use_routing_scales_on_input"] = use_routing_scales_on_input diff --git a/benchmarks/samples/sample_testlist_output.csv b/benchmarks/samples/sample_testlist_output.csv index d856d37ab0..b07c523ecb 100644 --- a/benchmarks/samples/sample_testlist_output.csv +++ b/benchmarks/samples/sample_testlist_output.csv @@ -1,4 +1,4 @@ -routine,median_time,std_time,tflops,tb_per_sec,backend,page_size,batch_size,s_qo,s_kv,num_qo_heads,num_kv_heads,head_dim_qk,head_dim_vo,head_dim_ckv,head_dim_kpe,causal,q_dtype,kv_dtype,avg_actual_seq_len,random_actual_seq_len,m,n,k,group_size,tile_size,scale_major_mode,out_dtype,mma_sm,use_128x4_sf_layout,use_nvfp4,num_tokens,hidden_size,intermediate_size,num_experts,top_k,n_group,topk_group,routed_scaling_factor,local_expert_offset,local_num_experts,tile_tokens_dim,routing_method,use_shuffled_weight,weight_layout,use_routing_bias,use_routing_scales_on_input,input_dtype,weight_dtype,gated_act,cutlass_variant,quantized_input,tp_size,tp_rank,ep_size,ep_rank,refcheck,no_cuda_graph,use_cupti,allow_output_mismatch,random_seed,case_tag,generate_repro_command,repro_command +routine,median_time,std_time,tflops,tb_per_sec,backend,page_size,batch_size,s_qo,s_kv,num_qo_heads,num_kv_heads,head_dim_qk,head_dim_vo,head_dim_ckv,head_dim_kpe,causal,q_dtype,kv_dtype,avg_actual_seq_len,random_actual_seq_len,m,n,k,group_size,tile_size,scale_major_mode,out_dtype,mma_sm,use_128x4_sf_layout,use_nvfp4,num_tokens,hidden_size,intermediate_size,num_experts,top_k,n_group,topk_group,routed_scaling_factor,local_expert_offset,local_num_experts,routing_method,use_shuffled_weight,weight_layout,use_routing_bias,use_routing_scales_on_input,input_dtype,weight_dtype,gated_act,cutlass_variant,quantized_input,tp_size,tp_rank,ep_size,ep_rank,refcheck,no_cuda_graph,use_cupti,allow_output_mismatch,random_seed,case_tag,generate_repro_command,repro_command BatchPrefillWithPagedKVCacheWrapper,0.01244799979031086,0.0009464459008260536,13.963516944729905,0.3050282827732261,fa2,16,1,1024,1024,64,8,128,128,,,True,torch.bfloat16,torch.bfloat16,103,True,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,True,False,False,True,42,Llama-3.1-70B,True,python3 flashinfer_benchmark.py --routine BatchPrefillWithPagedKVCacheWrapper --backends fa2 fa3 cudnn trtllm-gen --page_size 16 --batch_size 1 --s_qo 1024 --s_kv 1024 --num_qo_heads 64 --num_kv_heads 8 --head_dim_qk 128 --head_dim_vo 128 --random_actual_seq_len -vv --refcheck --causal --q_dtype bfloat16 --kv_dtype bfloat16 --allow_output_mismatch --generate_repro_command --case_tag Llama-3.1-70B BatchPrefillWithPagedKVCacheWrapper,0.01839040070772171,0.00021363710731210026,9.45155349045863,0.20646597430613514,cudnn,16,1,1024,1024,64,8,128,128,,,True,torch.bfloat16,torch.bfloat16,103,True,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,True,False,False,True,42,Llama-3.1-70B,True,python3 
flashinfer_benchmark.py --routine BatchPrefillWithPagedKVCacheWrapper --backends fa2 fa3 cudnn trtllm-gen --page_size 16 --batch_size 1 --s_qo 1024 --s_kv 1024 --num_qo_heads 64 --num_kv_heads 8 --head_dim_qk 128 --head_dim_vo 128 --random_actual_seq_len -vv --refcheck --causal --q_dtype bfloat16 --kv_dtype bfloat16 --allow_output_mismatch --generate_repro_command --case_tag Llama-3.1-70B BatchPrefillWithPagedKVCacheWrapper,0.008396799862384795,5.550615129103214e-05,20.70048814413847,0.45219512936224815,trtllm-gen,16,1,1024,1024,64,8,128,128,,,True,torch.bfloat16,torch.bfloat16,103,True,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,True,False,False,True,42,Llama-3.1-70B,True,python3 flashinfer_benchmark.py --routine BatchPrefillWithPagedKVCacheWrapper --backends fa2 fa3 cudnn trtllm-gen --page_size 16 --batch_size 1 --s_qo 1024 --s_kv 1024 --num_qo_heads 64 --num_kv_heads 8 --head_dim_qk 128 --head_dim_vo 128 --random_actual_seq_len -vv --refcheck --causal --q_dtype bfloat16 --kv_dtype bfloat16 --allow_output_mismatch --generate_repro_command --case_tag Llama-3.1-70B diff --git a/benchmarks/samples/sample_testlist_output.txt b/benchmarks/samples/sample_testlist_output.txt index 69a3961f87..d2c5cc4fa1 100644 --- a/benchmarks/samples/sample_testlist_output.txt +++ b/benchmarks/samples/sample_testlist_output.txt @@ -292,7 +292,7 @@ 2025-09-23 00:32:18,247 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process ends [PERF] cutlass_autotun:: median time 0.009 ms; std 0.000 ms; achieved tflops 6.372 TFLOPs/sec; achieved tb_per_sec 0.401 TB/sec [PERF] trtllm_autotune:: median time 0.011 ms; std 0.000 ms; achieved tflops 5.410 TFLOPs/sec; achieved tb_per_sec 0.340 TB/sec -[INFO] args = Namespace(routine='trtllm_fp4_block_scale_moe', no_cuda_graph=False, use_cupti=False, refcheck=False, allow_output_mismatch=False, random_seed=42, verbose=2, output_path=None, num_iters=30, dry_run_iters=5, case_tag='trtllm_moe_sample', generate_repro_command=True, repro_command='', num_tokens=1024, hidden_size=1024, intermediate_size=1024, num_experts=256, top_k=8, n_group=8, topk_group=4, routed_scaling_factor=2.5, local_expert_offset=0, local_num_experts=None, tile_tokens_dim=8, routing_method='deepseek_v3', use_shuffled_weight=True, weight_layout=0, use_routing_bias=True, use_routing_scales_on_input=False, input_dtype='bfloat16', weight_dtype='bfloat16', gated_act='swiglu', autotune=False, cutlass_variant='base', quantized_input=False, tp_size=1, tp_rank=0, ep_size=1, ep_rank=0, routing_method_type=2, gated_act_type=0) +[INFO] args = Namespace(routine='trtllm_fp4_block_scale_moe', no_cuda_graph=False, use_cupti=False, refcheck=False, allow_output_mismatch=False, random_seed=42, verbose=2, output_path=None, num_iters=30, dry_run_iters=5, case_tag='trtllm_moe_sample', generate_repro_command=True, repro_command='', num_tokens=1024, hidden_size=1024, intermediate_size=1024, num_experts=256, top_k=8, n_group=8, topk_group=4, routed_scaling_factor=2.5, local_expert_offset=0, local_num_experts=None, routing_method='deepseek_v3', use_shuffled_weight=True, weight_layout=0, use_routing_bias=True, use_routing_scales_on_input=False, input_dtype='bfloat16', weight_dtype='bfloat16', gated_act='swiglu', autotune=False, cutlass_variant='base', quantized_input=False, tp_size=1, tp_rank=0, ep_size=1, ep_rank=0, routing_method_type=2, gated_act_type=0) [INFO] Running testTrtllmFp4BlockScaleMoe [INFO] FlashInfer version: 0.3.1 [VVERBOSE] gpu_name = 'NVIDIA_B200' @@ -303,7 +303,7 @@ [VVERBOSE] gemm1_weights_fp4.shape = 
torch.Size([256, 2048, 512]) [VVERBOSE] gemm2_weights_fp4.shape = torch.Size([256, 1024, 512]) [PERF] trtllm :: median time 0.224 ms; std 0.000 ms; achieved tflops 230.555 TFLOPs/sec; achieved tb_per_sec 1.818 TB/sec -[INFO] args = Namespace(routine='trtllm_fp4_block_scale_moe', no_cuda_graph=False, use_cupti=False, refcheck=False, allow_output_mismatch=False, random_seed=42, verbose=2, output_path=None, num_iters=30, dry_run_iters=5, case_tag='trtllm_moe_sample', generate_repro_command=True, repro_command='', num_tokens=1024, hidden_size=1024, intermediate_size=1024, num_experts=128, top_k=8, n_group=None, topk_group=None, routed_scaling_factor=2.5, local_expert_offset=0, local_num_experts=None, tile_tokens_dim=8, routing_method='renormalize_naive', use_shuffled_weight=True, weight_layout=0, use_routing_bias=False, use_routing_scales_on_input=False, input_dtype='bfloat16', weight_dtype='bfloat16', gated_act='swiglu', autotune=False, cutlass_variant='base', quantized_input=False, tp_size=1, tp_rank=0, ep_size=1, ep_rank=0, routing_method_type=4, gated_act_type=0) +[INFO] args = Namespace(routine='trtllm_fp4_block_scale_moe', no_cuda_graph=False, use_cupti=False, refcheck=False, allow_output_mismatch=False, random_seed=42, verbose=2, output_path=None, num_iters=30, dry_run_iters=5, case_tag='trtllm_moe_sample', generate_repro_command=True, repro_command='', num_tokens=1024, hidden_size=1024, intermediate_size=1024, num_experts=128, top_k=8, n_group=None, topk_group=None, routed_scaling_factor=2.5, local_expert_offset=0, local_num_experts=None, routing_method='renormalize_naive', use_shuffled_weight=True, weight_layout=0, use_routing_bias=False, use_routing_scales_on_input=False, input_dtype='bfloat16', weight_dtype='bfloat16', gated_act='swiglu', autotune=False, cutlass_variant='base', quantized_input=False, tp_size=1, tp_rank=0, ep_size=1, ep_rank=0, routing_method_type=4, gated_act_type=0) [INFO] Running testTrtllmFp4BlockScaleMoe [INFO] FlashInfer version: 0.3.1 [VVERBOSE] gpu_name = 'NVIDIA_B200' @@ -314,7 +314,7 @@ [VVERBOSE] gemm1_weights_fp4.shape = torch.Size([128, 2048, 512]) [VVERBOSE] gemm2_weights_fp4.shape = torch.Size([128, 1024, 512]) [PERF] trtllm :: median time 0.226 ms; std 0.000 ms; achieved tflops 227.846 TFLOPs/sec; achieved tb_per_sec 0.903 TB/sec -[INFO] args = Namespace(routine='trtllm_fp8_block_scale_moe', no_cuda_graph=False, use_cupti=False, refcheck=False, allow_output_mismatch=False, random_seed=42, verbose=2, output_path=None, num_iters=30, dry_run_iters=5, case_tag='trtllm_moe_sample', generate_repro_command=True, repro_command='', num_tokens=1024, hidden_size=1024, intermediate_size=1024, num_experts=256, top_k=8, n_group=8, topk_group=4, routed_scaling_factor=2.5, local_expert_offset=0, local_num_experts=None, tile_tokens_dim=8, routing_method='deepseek_v3', use_shuffled_weight=True, weight_layout=0, use_routing_bias=True, use_routing_scales_on_input=False, input_dtype='bfloat16', weight_dtype='bfloat16', gated_act='swiglu', autotune=False, cutlass_variant='base', quantized_input=False, tp_size=1, tp_rank=0, ep_size=1, ep_rank=0, routing_method_type=2, gated_act_type=0) +[INFO] args = Namespace(routine='trtllm_fp8_block_scale_moe', no_cuda_graph=False, use_cupti=False, refcheck=False, allow_output_mismatch=False, random_seed=42, verbose=2, output_path=None, num_iters=30, dry_run_iters=5, case_tag='trtllm_moe_sample', generate_repro_command=True, repro_command='', num_tokens=1024, hidden_size=1024, intermediate_size=1024, num_experts=256, top_k=8, n_group=8, 
topk_group=4, routed_scaling_factor=2.5, local_expert_offset=0, local_num_experts=None, routing_method='deepseek_v3', use_shuffled_weight=True, weight_layout=0, use_routing_bias=True, use_routing_scales_on_input=False, input_dtype='bfloat16', weight_dtype='bfloat16', gated_act='swiglu', autotune=False, cutlass_variant='base', quantized_input=False, tp_size=1, tp_rank=0, ep_size=1, ep_rank=0, routing_method_type=2, gated_act_type=0) [INFO] Running testTrtllmFp8BlockScaleMoe [INFO] FlashInfer version: 0.3.1 [VVERBOSE] gpu_name = 'NVIDIA_B200' @@ -325,7 +325,7 @@ [VVERBOSE] gemm1_weights_fp8.shape = torch.Size([256, 2048, 1024]) [VVERBOSE] gemm2_weights_fp8.shape = torch.Size([256, 1024, 1024]) [PERF] trtllm :: median time 0.557 ms; std 0.000 ms; achieved tflops 92.607 TFLOPs/sec; achieved tb_per_sec 1.455 TB/sec -[INFO] args = Namespace(routine='trtllm_fp8_per_tensor_scale_moe', no_cuda_graph=False, use_cupti=False, refcheck=False, allow_output_mismatch=False, random_seed=42, verbose=2, output_path=None, num_iters=30, dry_run_iters=5, case_tag='trtllm_moe_sample', generate_repro_command=True, repro_command='', num_tokens=1024, hidden_size=1024, intermediate_size=1024, num_experts=128, top_k=1, n_group=None, topk_group=None, routed_scaling_factor=2.5, local_expert_offset=0, local_num_experts=None, tile_tokens_dim=8, routing_method='llama4', use_shuffled_weight=False, weight_layout=0, use_routing_bias=True, use_routing_scales_on_input=True, input_dtype='bfloat16', weight_dtype='bfloat16', gated_act='swiglu', autotune=False, cutlass_variant='base', quantized_input=False, tp_size=1, tp_rank=0, ep_size=1, ep_rank=0, routing_method_type=3, gated_act_type=0) +[INFO] args = Namespace(routine='trtllm_fp8_per_tensor_scale_moe', no_cuda_graph=False, use_cupti=False, refcheck=False, allow_output_mismatch=False, random_seed=42, verbose=2, output_path=None, num_iters=30, dry_run_iters=5, case_tag='trtllm_moe_sample', generate_repro_command=True, repro_command='', num_tokens=1024, hidden_size=1024, intermediate_size=1024, num_experts=128, top_k=1, n_group=None, topk_group=None, routed_scaling_factor=2.5, local_expert_offset=0, local_num_experts=None, routing_method='llama4', use_shuffled_weight=False, weight_layout=0, use_routing_bias=True, use_routing_scales_on_input=True, input_dtype='bfloat16', weight_dtype='bfloat16', gated_act='swiglu', autotune=False, cutlass_variant='base', quantized_input=False, tp_size=1, tp_rank=0, ep_size=1, ep_rank=0, routing_method_type=3, gated_act_type=0) [INFO] Running testTrtllmFp8PerTensorScaleMoe [INFO] FlashInfer version: 0.3.1 [VVERBOSE] gpu_name = 'NVIDIA_B200' @@ -336,7 +336,7 @@ [VVERBOSE] gemm1_weights_fp8.shape = torch.Size([128, 2048, 1024]) [VVERBOSE] gemm2_weights_fp8.shape = torch.Size([128, 1024, 1024]) [PERF] trtllm :: median time 0.123 ms; std 0.000 ms; achieved tflops 52.340 TFLOPs/sec; achieved tb_per_sec 3.299 TB/sec -[INFO] args = Namespace(routine='trtllm_fp8_block_scale_moe', no_cuda_graph=False, use_cupti=False, refcheck=False, allow_output_mismatch=False, random_seed=42, verbose=2, output_path=None, num_iters=30, dry_run_iters=5, case_tag='trtllm_moe_sample', generate_repro_command=True, repro_command='', num_tokens=1024, hidden_size=1024, intermediate_size=1024, num_experts=128, top_k=1, n_group=None, topk_group=None, routed_scaling_factor=2.5, local_expert_offset=0, local_num_experts=None, tile_tokens_dim=8, routing_method='renormalize', use_shuffled_weight=True, weight_layout=0, use_routing_bias=False, use_routing_scales_on_input=False, 
input_dtype='bfloat16', weight_dtype='bfloat16', gated_act='swiglu', autotune=False, cutlass_variant='base', quantized_input=False, tp_size=1, tp_rank=0, ep_size=1, ep_rank=0, routing_method_type=1, gated_act_type=0) +[INFO] args = Namespace(routine='trtllm_fp8_block_scale_moe', no_cuda_graph=False, use_cupti=False, refcheck=False, allow_output_mismatch=False, random_seed=42, verbose=2, output_path=None, num_iters=30, dry_run_iters=5, case_tag='trtllm_moe_sample', generate_repro_command=True, repro_command='', num_tokens=1024, hidden_size=1024, intermediate_size=1024, num_experts=128, top_k=1, n_group=None, topk_group=None, routed_scaling_factor=2.5, local_expert_offset=0, local_num_experts=None, routing_method='renormalize', use_shuffled_weight=True, weight_layout=0, use_routing_bias=False, use_routing_scales_on_input=False, input_dtype='bfloat16', weight_dtype='bfloat16', gated_act='swiglu', autotune=False, cutlass_variant='base', quantized_input=False, tp_size=1, tp_rank=0, ep_size=1, ep_rank=0, routing_method_type=1, gated_act_type=0) [INFO] Running testTrtllmFp8BlockScaleMoe [INFO] FlashInfer version: 0.3.1 [VVERBOSE] gpu_name = 'NVIDIA_B200' @@ -347,7 +347,7 @@ [VVERBOSE] gemm1_weights_fp8.shape = torch.Size([128, 2048, 1024]) [VVERBOSE] gemm2_weights_fp8.shape = torch.Size([128, 1024, 1024]) [PERF] trtllm :: median time 0.109 ms; std 0.000 ms; achieved tflops 59.297 TFLOPs/sec; achieved tb_per_sec 3.740 TB/sec -[INFO] args = Namespace(routine='cutlass_fused_moe', no_cuda_graph=False, use_cupti=False, refcheck=False, allow_output_mismatch=False, random_seed=42, verbose=2, output_path=None, num_iters=30, dry_run_iters=5, case_tag='cutlass_moe_base', generate_repro_command=True, repro_command='', num_tokens=32, hidden_size=128, intermediate_size=128, num_experts=2, top_k=2, n_group=None, topk_group=None, routed_scaling_factor=2.5, local_expert_offset=0, local_num_experts=None, tile_tokens_dim=8, routing_method='deepseek_v3', use_shuffled_weight=False, weight_layout=0, use_routing_bias=False, use_routing_scales_on_input=False, input_dtype='float16', weight_dtype='bfloat16', gated_act='swiglu', autotune=False, cutlass_variant='base', quantized_input=False, tp_size=1, tp_rank=0, ep_size=1, ep_rank=0, routing_method_type=2, gated_act_type=0) +[INFO] args = Namespace(routine='cutlass_fused_moe', no_cuda_graph=False, use_cupti=False, refcheck=False, allow_output_mismatch=False, random_seed=42, verbose=2, output_path=None, num_iters=30, dry_run_iters=5, case_tag='cutlass_moe_base', generate_repro_command=True, repro_command='', num_tokens=32, hidden_size=128, intermediate_size=128, num_experts=2, top_k=2, n_group=None, topk_group=None, routed_scaling_factor=2.5, local_expert_offset=0, local_num_experts=None, routing_method='deepseek_v3', use_shuffled_weight=False, weight_layout=0, use_routing_bias=False, use_routing_scales_on_input=False, input_dtype='float16', weight_dtype='bfloat16', gated_act='swiglu', autotune=False, cutlass_variant='base', quantized_input=False, tp_size=1, tp_rank=0, ep_size=1, ep_rank=0, routing_method_type=2, gated_act_type=0) [INFO] Running testCutlassFusedMoe [INFO] FlashInfer version: 0.3.1 [VVERBOSE] gpu_name = 'NVIDIA_B200' diff --git a/ci/docker-tags.yml b/ci/docker-tags.yml index ba3a947bc6..36fe4a6920 100644 --- a/ci/docker-tags.yml +++ b/ci/docker-tags.yml @@ -1,4 +1,4 @@ -flashinfer/flashinfer-ci-cu126: 20251024-0e48aaf -flashinfer/flashinfer-ci-cu128: 20251024-0e48aaf -flashinfer/flashinfer-ci-cu129: 20251024-0e48aaf -flashinfer/flashinfer-ci-cu130: 
20251024-0e48aaf +flashinfer/flashinfer-ci-cu126: 20251206-185d63a +flashinfer/flashinfer-ci-cu128: 20251206-185d63a +flashinfer/flashinfer-ci-cu129: 20251206-185d63a +flashinfer/flashinfer-ci-cu130: 20251206-185d63a diff --git a/csrc/batch_attention.cu b/csrc/batch_attention.cu index a3d36b7981..b37a9a6a18 100644 --- a/csrc/batch_attention.cu +++ b/csrc/batch_attention.cu @@ -48,7 +48,7 @@ Array BatchPagedAttentionPlan(TensorView float_workspace_buffer, HolisticPlanInfo<2> plan_info; - cudaSetDevice(float_workspace_buffer.device().device_id); + ffi::CUDADeviceGuard device_guard(float_workspace_buffer.device().device_id); const cudaStream_t stream = get_stream(float_workspace_buffer.device()); cudaError_t status = TwoStageHolisticPlan( @@ -102,7 +102,7 @@ void BatchPagedAttentionRun(TensorView float_workspace_buffer, TensorView int_wo v_stride_n = v_cache.stride(2); } - cudaSetDevice(q.device().device_id); + ffi::CUDADeviceGuard device_guard(q.device().device_id); const cudaStream_t stream = get_stream(q.device()); DISPATCH_context( diff --git a/csrc/batch_decode.cu b/csrc/batch_decode.cu index c3ce1e2ecf..8cc31fbe01 100644 --- a/csrc/batch_decode.cu +++ b/csrc/batch_decode.cu @@ -42,6 +42,8 @@ Array BatchDecodeWithPagedKVCachePlan( int64_t num_qo_heads, int64_t num_kv_heads, int64_t page_size, bool enable_cuda_graph, int64_t window_left, double logits_soft_cap, int64_t head_dim_qk, int64_t head_dim_vo, TensorView empty_q_data, TensorView empty_kv_data) { + CHECK_INPUT_TYPE(indptr, dl_int32); + size_t float_workspace_size_in_bytes = float_workspace_buffer.size(0) * get_element_size(float_workspace_buffer); size_t int_workspace_size_in_bytes = @@ -53,7 +55,7 @@ Array BatchDecodeWithPagedKVCachePlan( << "CUDA cores template only supports equal head dim for QK and VO, please use tensor " "cores template for different head dim"; - cudaSetDevice(float_workspace_buffer.device().device_id); + ffi::CUDADeviceGuard device_guard(float_workspace_buffer.device().device_id); const cudaStream_t stream = get_stream(float_workspace_buffer.device()); DISPATCH_context( DTypeQ, DTypeKV, DTypeO, IdType, HEAD_DIM_QK, HEAD_DIM_VO, POS_ENCODING_MODE, @@ -86,6 +88,10 @@ void BatchDecodeWithPagedKVCacheRun(TensorView float_workspace_buffer, TensorView o, Optional maybe_lse, int64_t kv_layout_code, int64_t window_left, bool enable_pdl ADDITIONAL_FUNC_PARAMS) { + CHECK_INPUT_TYPE(paged_kv_indptr, dl_int32); + CHECK_INPUT_TYPE(paged_kv_indices, dl_int32); + CHECK_INPUT_TYPE(paged_kv_last_page_len, dl_int32); + DecodePlanInfo plan_info; plan_info.FromVector(std::vector(plan_info_vec.begin(), plan_info_vec.end())); QKVLayout kv_layout = static_cast(kv_layout_code); @@ -130,7 +136,7 @@ void BatchDecodeWithPagedKVCacheRun(TensorView float_workspace_buffer, } kv_cache_strides = k_strides.data(); - cudaSetDevice(q.device().device_id); + ffi::CUDADeviceGuard device_guard(q.device().device_id); const cudaStream_t stream = get_stream(q.device()); DISPATCH_context( diff --git a/csrc/batch_decode_mla_cute_sm80.cu b/csrc/batch_decode_mla_cute_sm80.cu index 5679076438..45b708018f 100644 --- a/csrc/batch_decode_mla_cute_sm80.cu +++ b/csrc/batch_decode_mla_cute_sm80.cu @@ -23,7 +23,7 @@ Array BatchDecodeWithPagedKVCachePlanMLA(ffi::TensorView float_workspac int_workspace_buffer.size(0) * get_element_size(int_workspace_buffer); DecodePlanInfo plan_info; - cudaSetDevice(float_workspace_buffer.device().device_id); + ffi::CUDADeviceGuard device_guard(float_workspace_buffer.device().device_id); const cudaStream_t stream = 
get_stream(float_workspace_buffer.device()); auto work_estimation_func = BatchDecodeWithPagedKVCacheWorkEstimationDispatchedMlaCuteSM80< @@ -103,7 +103,7 @@ void BatchDecodeWithPagedKVCacheRunMLA( } params.padded_batch_size = plan_info.padded_batch_size; - cudaSetDevice(paged_ckv_cache.device().device_id); + ffi::CUDADeviceGuard device_guard(paged_ckv_cache.device().device_id); const cudaStream_t stream = get_stream(paged_ckv_cache.device()); cudaError_t status = BatchDecodeWithPagedKVCacheDispatchedMlaCuteSM80( diff --git a/csrc/batch_decode_mla_plan.cu b/csrc/batch_decode_mla_plan.cu index 7925a14f27..e409cde882 100644 --- a/csrc/batch_decode_mla_plan.cu +++ b/csrc/batch_decode_mla_plan.cu @@ -15,7 +15,9 @@ Array BatchDecodeWithPagedKVCachePlanMLA(TensorView float_workspace_buf TensorView indptr, int64_t batch_size, int64_t num_qo_heads, int64_t page_size, bool enable_cuda_graph) { - cudaSetDevice(float_workspace_buffer.device().device_id); + CHECK_INPUT_TYPE(indptr, dl_int32); + + ffi::CUDADeviceGuard device_guard(float_workspace_buffer.device().device_id); const cudaStream_t stream = get_stream(float_workspace_buffer.device()); size_t float_workspace_size_in_bytes = diff --git a/csrc/batch_decode_mla_run.cu b/csrc/batch_decode_mla_run.cu index 35d533b536..94b5e35e0b 100644 --- a/csrc/batch_decode_mla_run.cu +++ b/csrc/batch_decode_mla_run.cu @@ -17,6 +17,10 @@ void BatchDecodeWithPagedKVCacheRunMLA( TensorView paged_kv_last_page_len, TensorView o, double sm_scale, int64_t window_left, double logits_soft_cap, double rope_scale, double rope_theta, Optional maybe_lse, bool enable_pdl) { + CHECK_INPUT_TYPE(paged_kv_indptr, dl_int32); + CHECK_INPUT_TYPE(paged_kv_indices, dl_int32); + CHECK_INPUT_TYPE(paged_kv_last_page_len, dl_int32); + DecodePlanInfo plan_info; plan_info.FromVector(std::vector(plan_info_vec.begin(), plan_info_vec.end())); @@ -35,7 +39,7 @@ void BatchDecodeWithPagedKVCacheRunMLA( void* float_buffer = static_cast(float_workspace_buffer.data_ptr()); void* int_buffer = static_cast(int_workspace_buffer.data_ptr()); - cudaSetDevice(q_nope.device().device_id); + ffi::CUDADeviceGuard device_guard(q_nope.device().device_id); const cudaStream_t stream = get_stream(q_nope.device()); paged_kv_mla_t paged_kv( diff --git a/csrc/batch_mla_binding.cu b/csrc/batch_mla_binding.cu index 6822e28b93..b39192de6a 100644 --- a/csrc/batch_mla_binding.cu +++ b/csrc/batch_mla_binding.cu @@ -31,7 +31,8 @@ void BatchMLAPagedAttentionRun(TensorView float_workspace_buffer, TensorView int Array plan_info_vec, TensorView q_nope, TensorView q_pe, TensorView ckv_cache, TensorView kpe_cache, TensorView kv_indices, TensorView o, Optional maybe_lse, int64_t mask_mode_code, - int64_t num_heads, int64_t page_size, double sm_scale); + int64_t num_heads, int64_t page_size, double sm_scale, + bool return_lse_base_on_e); TVM_FFI_DLL_EXPORT_TYPED_FUNC(plan, BatchMLAPagedAttentionPlan); TVM_FFI_DLL_EXPORT_TYPED_FUNC(run, BatchMLAPagedAttentionRun); diff --git a/csrc/batch_mla_plan.cu b/csrc/batch_mla_plan.cu index 1f7176e452..f4e8bc4bda 100644 --- a/csrc/batch_mla_plan.cu +++ b/csrc/batch_mla_plan.cu @@ -29,6 +29,10 @@ Array BatchMLAPagedAttentionPlan(TensorView float_workspace_buffer, TensorView qo_indptr, TensorView kv_indptr, TensorView kv_len, int64_t num_heads, int64_t head_dim_o, bool causal) { + CHECK_INPUT_TYPE(qo_indptr, dl_int32); + CHECK_INPUT_TYPE(kv_indptr, dl_int32); + CHECK_INPUT_TYPE(kv_len, dl_int32); + size_t float_workspace_size_in_bytes = float_workspace_buffer.size(0) * 
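// [Editorial sketch, not part of this patch] The recurring change in these hunks swaps the bare
// cudaSetDevice(...) call for ffi::CUDADeviceGuard, i.e. a one-way device switch becomes an RAII
// guard that restores the caller's device when the function returns. The accompanying
// CHECK_INPUT_TYPE(..., dl_int32) additions validate that indptr/indices tensors arrive as int32
// before planning. Assuming the tvm-ffi guard follows the usual RAII pattern, a minimal
// equivalent looks like:
#include <cuda_runtime.h>

class CUDADeviceGuardSketch {
 public:
  explicit CUDADeviceGuardSketch(int device) {
    cudaGetDevice(&prev_device_);                       // remember the caller's device
    if (device != prev_device_) cudaSetDevice(device);  // switch only if needed
  }
  ~CUDADeviceGuardSketch() { cudaSetDevice(prev_device_); }  // restore on scope exit

 private:
  int prev_device_ = 0;
};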
get_element_size(float_workspace_buffer); size_t int_workspace_size_in_bytes = @@ -38,7 +42,7 @@ Array BatchMLAPagedAttentionPlan(TensorView float_workspace_buffer, int batch_size = kv_len.size(0); - cudaSetDevice(float_workspace_buffer.device().device_id); + ffi::CUDADeviceGuard device_guard(float_workspace_buffer.device().device_id); const cudaStream_t stream = get_stream(float_workspace_buffer.device()); cudaError_t status = diff --git a/csrc/batch_mla_run.cu b/csrc/batch_mla_run.cu index dfa2442f1b..3dc142d1f4 100644 --- a/csrc/batch_mla_run.cu +++ b/csrc/batch_mla_run.cu @@ -31,11 +31,14 @@ void BatchMLAPagedAttentionRun(TensorView float_workspace_buffer, TensorView int Array plan_info_vec, TensorView q_nope, TensorView q_pe, TensorView ckv_cache, TensorView kpe_cache, TensorView kv_indices, TensorView o, Optional maybe_lse, int64_t mask_mode_code, - int64_t num_heads, int64_t page_size, double sm_scale) { + int64_t num_heads, int64_t page_size, double sm_scale, + bool return_lse_base_on_e) { // q_nope: [n, num_heads, head_dim_ckv] // q_pe: [n, num_heads, head_dim_kpe] // ckv_cache: [num_pages, page_size, head_dim_ckv] // kpe_cache: [num_pages, page_size, head_dim_kpe] + CHECK_INPUT_TYPE(kv_indices, dl_int32); + MLAPlanInfo plan_info; plan_info.FromVector(std::vector(plan_info_vec.begin(), plan_info_vec.end())); @@ -55,7 +58,7 @@ void BatchMLAPagedAttentionRun(TensorView float_workspace_buffer, TensorView int unsigned int o_stride_n = o.stride(0); unsigned int o_stride_h = o.stride(1); - cudaSetDevice(q_nope.device().device_id); + ffi::CUDADeviceGuard device_guard(q_nope.device().device_id); const cudaStream_t stream = get_stream(q_nope.device()); DISPATCH_context( @@ -112,6 +115,7 @@ void BatchMLAPagedAttentionRun(TensorView float_workspace_buffer, TensorView int params.o_stride_h = o_stride_h; params.sm_scale = sm_scale; + params.return_lse_base_on_e = return_lse_base_on_e; cudaError_t status = mla::BatchMLAPagedAttention( params, plan_info.num_blks_x, plan_info.num_blks_y, stream); diff --git a/csrc/batch_mla_sm90_binding.cu b/csrc/batch_mla_sm90_binding.cu index 2e6cd1aa7d..f2af49766a 100644 --- a/csrc/batch_mla_sm90_binding.cu +++ b/csrc/batch_mla_sm90_binding.cu @@ -32,8 +32,8 @@ void BatchMLAPagedAttentionSM90Run(TensorView float_workspace_buffer, TensorView q_nope, TensorView q_pe, TensorView ckv_cache, TensorView kpe_cache, TensorView kv_indices, TensorView o, Optional maybe_lse, int64_t mask_mode_code, - int64_t num_heads, int64_t page_size, - double sm_scale ADDITIONAL_FUNC_PARAMS); + int64_t num_heads, int64_t page_size, double sm_scale, + bool return_lse_base_on_e ADDITIONAL_FUNC_PARAMS); TVM_FFI_DLL_EXPORT_TYPED_FUNC(plan, BatchMLAPagedAttentionSM90Plan); TVM_FFI_DLL_EXPORT_TYPED_FUNC(run, BatchMLAPagedAttentionSM90Run); diff --git a/csrc/batch_mla_sm90_plan.cu b/csrc/batch_mla_sm90_plan.cu index d297ebab90..e51932e64b 100644 --- a/csrc/batch_mla_sm90_plan.cu +++ b/csrc/batch_mla_sm90_plan.cu @@ -38,7 +38,7 @@ Array BatchMLAPagedAttentionSM90Plan(TensorView float_workspace_buffer, int batch_size = kv_len.size(0); - cudaSetDevice(float_workspace_buffer.device().device_id); + ffi::CUDADeviceGuard device_guard(float_workspace_buffer.device().device_id); const cudaStream_t stream = get_stream(float_workspace_buffer.device()); cudaError_t status = diff --git a/csrc/batch_mla_sm90_run.cu b/csrc/batch_mla_sm90_run.cu index 8d6d80c223..b47a7ff7dc 100644 --- a/csrc/batch_mla_sm90_run.cu +++ b/csrc/batch_mla_sm90_run.cu @@ -31,8 +31,8 @@ void 
BatchMLAPagedAttentionSM90Run(TensorView float_workspace_buffer, TensorView q_nope, TensorView q_pe, TensorView ckv_cache, TensorView kpe_cache, TensorView kv_indices, TensorView o, Optional maybe_lse, int64_t mask_mode_code, - int64_t num_heads, int64_t page_size, - double sm_scale ADDITIONAL_FUNC_PARAMS) { + int64_t num_heads, int64_t page_size, double sm_scale, + bool return_lse_base_on_e ADDITIONAL_FUNC_PARAMS) { // q_nope: [n, num_heads, head_dim_ckv] // q_pe: [n, num_heads, head_dim_kpe] // ckv_cache: [num_pages, page_size, head_dim_ckv] @@ -56,7 +56,7 @@ void BatchMLAPagedAttentionSM90Run(TensorView float_workspace_buffer, unsigned int o_stride_n = o.stride(0); unsigned int o_stride_h = o.stride(1); - cudaSetDevice(q_nope.device().device_id); + ffi::CUDADeviceGuard device_guard(q_nope.device().device_id); const cudaStream_t stream = get_stream(q_nope.device()); DISPATCH_context( @@ -111,6 +111,7 @@ void BatchMLAPagedAttentionSM90Run(TensorView float_workspace_buffer, params.kpe_stride_n = kpe_stride_n; params.o_stride_n = o_stride_n; params.o_stride_h = o_stride_h; + params.return_lse_base_on_e = return_lse_base_on_e; ADDITIONAL_PARAMS_SETTER diff --git a/csrc/batch_pod.cu b/csrc/batch_pod.cu new file mode 100644 index 0000000000..98ff9d83da --- /dev/null +++ b/csrc/batch_pod.cu @@ -0,0 +1,350 @@ +/* + * Copyright (c) 2023 by FlashInfer team. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include +#include + +#include "batch_pod_config.inc" +#include "tvm_ffi_utils.h" + +namespace flashinfer { +template +cudaError_t BatchPODWithKVCacheTensorDispatched(PrefillParams prefill_params, + typename PrefillParams::DTypeO* tmp_v_p, + float* tmp_s_p, DecodeParams decode_params, + typename DecodeParams::DTypeO* tmp_v_d, + float* tmp_s_d, bool enable_pdl, + cudaStream_t stream, int* sm_aware_sched); + +} // namespace flashinfer + +using namespace flashinfer; + +using tvm::ffi::Array; +using tvm::ffi::Optional; + +void batch_pod_with_kv_cache_tensor( + // Prefill params + TensorView float_workspace_buffer_p, TensorView int_workspace_buffer_p, + Array plan_info_vec_p, TensorView q_p, TensorView paged_k_cache_p, + TensorView paged_v_cache_p, TensorView qo_indptr_p, TensorView paged_kv_indptr_p, + TensorView paged_kv_indices_p, TensorView paged_kv_last_page_len_p, TensorView o_p, + Optional maybe_lse_p, int64_t mask_mode_code_p, int64_t layout_p, + int64_t window_left_p, Optional maybe_custom_mask_p, + Optional maybe_mask_indptr_p, Optional maybe_alibi_slopes_p, + double logits_soft_cap_p, double sm_scale_p, double rope_rcp_scale_p, double rope_rcp_theta_p, + // Decode params + TensorView float_workspace_buffer_d, TensorView int_workspace_buffer_d, + Array plan_info_vec_d, TensorView q_d, TensorView paged_k_cache_d, + TensorView paged_v_cache_d, TensorView qo_indptr_d, TensorView paged_kv_indptr_d, + TensorView paged_kv_indices_d, TensorView paged_kv_last_page_len_d, TensorView o_d, + Optional maybe_lse_d, int64_t mask_mode_code_d, int64_t layout_d, + int64_t window_left_d, Optional maybe_custom_mask_d, + Optional maybe_mask_indptr_d, Optional maybe_alibi_slopes_d, + double logits_soft_cap_d, double sm_scale_d, double rope_rcp_scale_d, double rope_rcp_theta_d, + bool enable_pdl, TensorView sm_aware_sched) { + // Prefill setup + PrefillPlanInfo plan_info_p; + plan_info_p.FromVector(std::vector(plan_info_vec_p.begin(), plan_info_vec_p.end())); + QKVLayout kv_layout_p = static_cast(layout_p); + int64_t batch_size_p = paged_kv_indptr_p.size(0) - 1; + int64_t num_qo_heads = q_p.size(1); + + int64_t num_kv_heads_p, page_size_p; + uint32_t head_dim_qk_p = q_p.size(2); + if (kv_layout_p == QKVLayout::kHND) { + num_kv_heads_p = paged_k_cache_p.size(1); + page_size_p = paged_k_cache_p.size(2); + } else { + page_size_p = paged_k_cache_p.size(1); + num_kv_heads_p = paged_k_cache_p.size(2); + } + + if (maybe_lse_p.has_value()) { + const auto& lse = maybe_lse_p.value(); + TVM_FFI_ICHECK_EQ(lse.size(0), q_p.size(0)); + TVM_FFI_ICHECK_EQ(lse.size(1), q_p.size(1)); + } + + void* float_buffer_ptr_p = static_cast(float_workspace_buffer_p.data_ptr()); + void* int_buffer_ptr_p = static_cast(int_workspace_buffer_p.data_ptr()); + + const MaskMode mask_mode_p = static_cast(mask_mode_code_p); + + // get q_stride_n and q_stride_h + const auto q_stride_n_p = q_p.stride(0); + const auto q_stride_h_p = q_p.stride(1); + + // get kv_cache_strides + const int64_t* kv_cache_strides_p = nullptr; + auto k_strides_p = paged_k_cache_p.strides(); + auto v_strides_p = paged_v_cache_p.strides(); + TVM_FFI_ICHECK_EQ(k_strides_p.size(), v_strides_p.size()); + for (int i = 0; i < k_strides_p.size(); ++i) { + TVM_FFI_ICHECK_EQ(k_strides_p[i], v_strides_p[i]); + } + kv_cache_strides_p = k_strides_p.data(); + + ffi::CUDADeviceGuard device_guard(float_workspace_buffer_p.device().device_id); + const cudaStream_t stream = get_stream(float_workspace_buffer_p.device()); + + // Decode setup (TensorView decode = batched prefill) + 
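// [Editorial note, not part of this patch] In this new batch POD entry point the decode half
// reuses the prefill machinery: DecodeParams is the same BatchPrefillPagedParams structure as
// PrefillParams (see the customize config further below), and the dispatch at the end of this
// function pins CTA_TILE_Q_D to 16, a small query tile suited to the short (typically one-token)
// queries of the decode batch.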
PrefillPlanInfo plan_info_d; + plan_info_d.FromVector(std::vector(plan_info_vec_d.begin(), plan_info_vec_d.end())); + QKVLayout kv_layout_d = static_cast(layout_d); + int64_t batch_size_d = paged_kv_indptr_d.size(0) - 1; + int64_t num_qo_heads_d = q_d.size(1); + + TVM_FFI_ICHECK_EQ(num_qo_heads, num_qo_heads_d) + << "POD currently requires same # Query heads for prefill and decode"; + + int64_t num_kv_heads_d, page_size_d; + uint32_t head_dim_qk_d = q_d.size(2); + if (kv_layout_d == QKVLayout::kHND) { + num_kv_heads_d = paged_k_cache_d.size(1); + page_size_d = paged_k_cache_d.size(2); + } else { + page_size_d = paged_k_cache_d.size(1); + num_kv_heads_d = paged_k_cache_d.size(2); + } + TVM_FFI_ICHECK_EQ(num_kv_heads_p, num_kv_heads_d) + << "POD currently requires same # KV heads for prefill and decode; Prefill: " + << num_kv_heads_p << ", Decode: " << num_kv_heads_d; + + if (maybe_lse_d.has_value()) { + const auto& lse = maybe_lse_d.value(); + TVM_FFI_ICHECK_EQ(lse.size(0), q_d.size(0)); + TVM_FFI_ICHECK_EQ(lse.size(1), q_d.size(1)); + } + + void* float_buffer_ptr_d = static_cast(float_workspace_buffer_d.data_ptr()); + void* int_buffer_ptr_d = static_cast(int_workspace_buffer_d.data_ptr()); + + const MaskMode mask_mode_d = static_cast(mask_mode_code_d); + + // get q_stride_n and q_stride_h + const auto q_stride_n_d = q_d.stride(0); + const auto q_stride_h_d = q_d.stride(1); + + // get kv_cache_strides + const int64_t* kv_cache_strides_d = nullptr; + auto k_strides_d = paged_k_cache_d.strides(); + auto v_strides_d = paged_v_cache_d.strides(); + TVM_FFI_ICHECK_EQ(k_strides_d.size(), v_strides_d.size()); + for (int i = 0; i < k_strides_d.size(); ++i) { + TVM_FFI_ICHECK_EQ(k_strides_d[i], v_strides_d[i]); + } + kv_cache_strides_d = k_strides_d.data(); + + // Already handled by prefill + // ffi::CUDADeviceGuard device_guard(float_workspace_buffer_d.device().device_id); + // const cudaStream_t stream = get_stream(float_workspace_buffer_d.device()); + + DISPATCH_context( + MASK_MODE_P, MASK_MODE_D, DTypeQ, DTypeKV, HEAD_DIM_QK, USE_SLIDING_WINDOW_P, + USE_SLIDING_WINDOW_D, USE_LOGITS_SOFT_CAP, [&] { + PrefillParams prefill_params; + DTypeO* tmp_v_p = nullptr; + float* tmp_s_p = nullptr; + { + PrefillParams& params = prefill_params; + params.q = static_cast(q_p.data_ptr()); + paged_kv_t paged_kv( + num_kv_heads_p, page_size_p, HEAD_DIM_VO, batch_size_p, kv_layout_p, + static_cast(paged_k_cache_p.data_ptr()), + static_cast(paged_v_cache_p.data_ptr()), kv_cache_strides_p, + static_cast(paged_kv_indices_p.data_ptr()), + static_cast(paged_kv_indptr_p.data_ptr()), + static_cast(paged_kv_last_page_len_p.data_ptr())); + params.paged_kv = paged_kv; + params.q_indptr = static_cast(qo_indptr_p.data_ptr()); + params.o = static_cast(o_p.data_ptr()); + + params.lse = maybe_lse_p.has_value() ? static_cast(maybe_lse_p.value().data_ptr()) + : nullptr; + params.num_qo_heads = num_qo_heads; + params.group_size = uint_fastdiv(num_qo_heads / paged_kv.num_heads); + params.q_stride_n = q_stride_n_p; + params.q_stride_h = q_stride_h_p; + params.window_left = window_left_p; + + params.request_indices = nullptr; + params.qo_tile_indices = nullptr; + params.kv_tile_indices = nullptr; + params.merge_indptr = nullptr; + params.o_indptr = nullptr; + params.kv_chunk_size_ptr = nullptr; + params.block_valid_mask = nullptr; + params.total_num_rows = nullptr; + params.max_total_num_rows = 0; + params.padded_batch_size = 0; + params.partition_kv = false; + + params.maybe_mask_indptr = + maybe_mask_indptr_p.has_value() + ? 
static_cast(maybe_mask_indptr_p.value().data_ptr()) + : nullptr; + params.maybe_alibi_slopes = + maybe_alibi_slopes_p.has_value() + ? static_cast(maybe_alibi_slopes_p.value().data_ptr()) + : nullptr; + params.logits_soft_cap = logits_soft_cap_p; + params.sm_scale = sm_scale_p; + params.rope_rcp_scale = rope_rcp_scale_p; + params.rope_rcp_theta = rope_rcp_theta_p; + + params.request_indices = + GetPtrFromBaseOffset(int_buffer_ptr_p, plan_info_p.request_indices_offset); + params.qo_tile_indices = + GetPtrFromBaseOffset(int_buffer_ptr_p, plan_info_p.qo_tile_indices_offset); + params.kv_tile_indices = + GetPtrFromBaseOffset(int_buffer_ptr_p, plan_info_p.kv_tile_indices_offset); + params.o_indptr = + GetPtrFromBaseOffset(int_buffer_ptr_p, plan_info_p.o_indptr_offset); + params.kv_chunk_size_ptr = + GetPtrFromBaseOffset(int_buffer_ptr_p, plan_info_p.kv_chunk_size_ptr_offset); + if (plan_info_p.split_kv) { + params.merge_indptr = + GetPtrFromBaseOffset(int_buffer_ptr_p, plan_info_p.merge_indptr_offset); + tmp_v_p = GetPtrFromBaseOffset(float_buffer_ptr_p, plan_info_p.v_offset); + tmp_s_p = GetPtrFromBaseOffset(float_buffer_ptr_p, plan_info_p.s_offset); + if (plan_info_p.enable_cuda_graph) { + params.block_valid_mask = + GetPtrFromBaseOffset(int_buffer_ptr_p, plan_info_p.block_valid_mask_offset); + } + } + params.padded_batch_size = plan_info_p.padded_batch_size; + params.max_total_num_rows = plan_info_p.total_num_rows; + if (plan_info_p.enable_cuda_graph) { + params.total_num_rows = + GetPtrFromBaseOffset(int_buffer_ptr_p, plan_info_p.total_num_rows_offset); + } + } + + DecodeParams decode_params; + DTypeO* tmp_v_d = nullptr; + float* tmp_s_d = nullptr; + { + DecodeParams& params = decode_params; + params.q = static_cast(q_d.data_ptr()); + paged_kv_t paged_kv( + num_kv_heads_d, page_size_d, HEAD_DIM_VO, batch_size_d, kv_layout_d, + static_cast(paged_k_cache_d.data_ptr()), + static_cast(paged_v_cache_d.data_ptr()), kv_cache_strides_d, + static_cast(paged_kv_indices_d.data_ptr()), + static_cast(paged_kv_indptr_d.data_ptr()), + static_cast(paged_kv_last_page_len_d.data_ptr())); + params.paged_kv = paged_kv; + params.q_indptr = static_cast(qo_indptr_d.data_ptr()); + params.o = static_cast(o_d.data_ptr()); + + params.lse = maybe_lse_d.has_value() ? static_cast(maybe_lse_d.value().data_ptr()) + : nullptr; + params.num_qo_heads = num_qo_heads; + params.group_size = uint_fastdiv(num_qo_heads / paged_kv.num_heads); + params.q_stride_n = q_stride_n_d; + params.q_stride_h = q_stride_h_d; + params.window_left = window_left_d; + + params.request_indices = nullptr; + params.qo_tile_indices = nullptr; + params.kv_tile_indices = nullptr; + params.merge_indptr = nullptr; + params.o_indptr = nullptr; + params.kv_chunk_size_ptr = nullptr; + params.block_valid_mask = nullptr; + params.total_num_rows = nullptr; + params.max_total_num_rows = 0; + params.padded_batch_size = 0; + params.partition_kv = false; + + params.maybe_mask_indptr = + maybe_mask_indptr_d.has_value() + ? static_cast(maybe_mask_indptr_d.value().data_ptr()) + : nullptr; + params.maybe_alibi_slopes = + maybe_alibi_slopes_d.has_value() + ? 
static_cast(maybe_alibi_slopes_d.value().data_ptr()) + : nullptr; + params.logits_soft_cap = logits_soft_cap_d; + params.sm_scale = sm_scale_d; + params.rope_rcp_scale = rope_rcp_scale_d; + params.rope_rcp_theta = rope_rcp_theta_d; + + params.request_indices = + GetPtrFromBaseOffset(int_buffer_ptr_d, plan_info_d.request_indices_offset); + params.qo_tile_indices = + GetPtrFromBaseOffset(int_buffer_ptr_d, plan_info_d.qo_tile_indices_offset); + params.kv_tile_indices = + GetPtrFromBaseOffset(int_buffer_ptr_d, plan_info_d.kv_tile_indices_offset); + params.o_indptr = + GetPtrFromBaseOffset(int_buffer_ptr_d, plan_info_d.o_indptr_offset); + params.kv_chunk_size_ptr = + GetPtrFromBaseOffset(int_buffer_ptr_d, plan_info_d.kv_chunk_size_ptr_offset); + if (plan_info_d.split_kv) { + params.merge_indptr = + GetPtrFromBaseOffset(int_buffer_ptr_d, plan_info_d.merge_indptr_offset); + tmp_v_d = GetPtrFromBaseOffset(float_buffer_ptr_d, plan_info_d.v_offset); + tmp_s_d = GetPtrFromBaseOffset(float_buffer_ptr_d, plan_info_d.s_offset); + if (plan_info_d.enable_cuda_graph) { + params.block_valid_mask = + GetPtrFromBaseOffset(int_buffer_ptr_d, plan_info_d.block_valid_mask_offset); + } + } + params.padded_batch_size = plan_info_d.padded_batch_size; + params.max_total_num_rows = plan_info_d.total_num_rows; + if (plan_info_d.enable_cuda_graph) { + params.total_num_rows = + GetPtrFromBaseOffset(int_buffer_ptr_d, plan_info_d.total_num_rows_offset); + } + } + + constexpr bool use_custom_mask_p = MASK_MODE_P == MaskMode::kCustom; + using PrefillAttentionVariant = + DefaultAttention; + constexpr bool use_custom_mask_d = MASK_MODE_D == MaskMode::kCustom; + using DecodeAttentionVariant = + DefaultAttention; + + int dev_id = 0; + FLASHINFER_CUDA_CALL(cudaGetDevice(&dev_id)); + int num_sm = 0; + FLASHINFER_CUDA_CALL( + cudaDeviceGetAttribute(&num_sm, cudaDevAttrMultiProcessorCount, dev_id)); + // SM-aware scheduling buffer uses num_sm + 2 entries + // num_sm entries for counters for each SM, and + // 2 entries for keeping track of blockIds for prefill and decode + assert( + sm_aware_sched.ndim() == 1 && sm_aware_sched.size(0) == num_sm + 2 && + "sm_aware_sched tensor has incorrect shape or type, should be (num_sm + 2,) of int32"); + DISPATCH_CTA_TILE_Q(plan_info_p.cta_tile_q, CTA_TILE_Q_P, { + constexpr size_t CTA_TILE_Q_D = 16; + cudaError_t status = flashinfer::BatchPODWithKVCacheTensorDispatched< + HEAD_DIM_QK, HEAD_DIM_VO, POS_ENCODING_MODE, USE_FP16_QK_REDUCTION, CTA_TILE_Q_P, + MASK_MODE_P, CTA_TILE_Q_D, MASK_MODE_D, PrefillAttentionVariant, + DecodeAttentionVariant>(prefill_params, tmp_v_p, tmp_s_p, decode_params, tmp_v_d, + tmp_s_d, enable_pdl, stream, + static_cast(sm_aware_sched.data_ptr())); + TVM_FFI_ICHECK(status == cudaSuccess) + << "BatchPODWithKVCache kernel launch failed, error: " << cudaGetErrorString(status); + return status; + }); + }); +} diff --git a/csrc/batch_pod_customize_config.jinja b/csrc/batch_pod_customize_config.jinja new file mode 100644 index 0000000000..9f27b42953 --- /dev/null +++ b/csrc/batch_pod_customize_config.jinja @@ -0,0 +1,43 @@ +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace flashinfer; + +using DTypeQ = {{ dtype_q }}; +using DTypeKV = {{ dtype_kv }}; +using DTypeO = {{ dtype_o }}; +using IdType = {{ idtype }}; +constexpr int HEAD_DIM_QK = {{ head_dim_qk }}; +constexpr int HEAD_DIM_VO = {{ head_dim_vo }}; +constexpr bool USE_FP16_QK_REDUCTION = {{ use_fp16_qk_reduction }}; + +constexpr auto 
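// [Editorial sketch, not part of this patch] batch_pod_with_kv_cache_tensor above expects
// sm_aware_sched to be an int32 tensor of num_sm + 2 entries: one counter per SM plus two slots
// tracking the current block ids for prefill and decode. Assuming a caller that manages raw CUDA
// memory itself (in practice the Python side passes a tensor), the buffer could be prepared as
// below; zero-initialization is an assumption here, the patch itself only asserts shape and dtype.
#include <cuda_runtime.h>

int* alloc_sm_aware_sched_buffer(int device_id) {
  int num_sm = 0;
  cudaDeviceGetAttribute(&num_sm, cudaDevAttrMultiProcessorCount, device_id);
  int* buf = nullptr;
  cudaMalloc(&buf, sizeof(int) * (num_sm + 2));    // num_sm counters + 2 block-id slots
  cudaMemset(buf, 0, sizeof(int) * (num_sm + 2));  // assumed: scheduler starts from zeroed state
  return buf;
}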
USE_LOGITS_SOFT_CAP_P = {{ use_logits_soft_cap_p }}; +constexpr auto POS_ENCODING_MODE_P = {{ pos_encoding_mode_p }}; +constexpr auto USE_SLIDING_WINDOW_P = {{ use_sliding_window_p }}; + +constexpr auto USE_LOGITS_SOFT_CAP_D = {{ use_logits_soft_cap_d }}; +constexpr auto POS_ENCODING_MODE_D = {{ pos_encoding_mode_d }}; +constexpr auto USE_SLIDING_WINDOW_D = {{ use_sliding_window_d }}; + +constexpr auto POS_ENCODING_MODE = PosEncodingMode::kNone; +constexpr bool USE_LOGITS_SOFT_CAP = false; + +using PrefillParams = BatchPrefillPagedParams; +using DecodeParams = BatchPrefillPagedParams; + +#define DISPATCH_context(MASK_MODE_P, MASK_MODE_D, DTypeQ, DTypeKV, HEAD_DIM_QK, \ + USE_SLIDING_WINDOW_P, USE_SLIDING_WINDOW_D, USE_LOGITS_SOFT_CAP, ...) \ + DISPATCH_MASK_MODE(mask_mode_p, MASK_MODE_P, { \ + DISPATCH_MASK_MODE(mask_mode_d, MASK_MODE_D, { \ + __VA_ARGS__(); \ + }); \ +}); diff --git a/csrc/batch_pod_jit_binding.cu b/csrc/batch_pod_jit_binding.cu new file mode 100644 index 0000000000..c7a8a5ea6b --- /dev/null +++ b/csrc/batch_pod_jit_binding.cu @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2023-2025 by FlashInfer team. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "batch_pod_config.inc" +#include "tvm_ffi_utils.h" + +using tvm::ffi::Array; +using tvm::ffi::Optional; + +void batch_pod_with_kv_cache_tensor( + // Prefill params + TensorView float_workspace_buffer_p, TensorView int_workspace_buffer_p, + Array plan_info_vec_p, TensorView q_p, TensorView paged_k_cache_p, + TensorView paged_v_cache_p, TensorView qo_indptr_p, TensorView paged_kv_indptr_p, + TensorView paged_kv_indices_p, TensorView paged_kv_last_page_len_p, TensorView o_p, + Optional maybe_lse_p, int64_t mask_mode_code_p, int64_t layout_p, + int64_t window_left_p, Optional maybe_custom_mask_p, + Optional maybe_mask_indptr_p, Optional maybe_alibi_slopes_p, + double logits_soft_cap_p, double sm_scale_p, double rope_rcp_scale_p, double rope_rcp_theta_p, + // Decode params + TensorView float_workspace_buffer_d, TensorView int_workspace_buffer_d, + Array plan_info_vec_d, TensorView q_d, TensorView paged_k_cache_d, + TensorView paged_v_cache_d, TensorView qo_indptr_d, TensorView paged_kv_indptr_d, + TensorView paged_kv_indices_d, TensorView paged_kv_last_page_len_d, TensorView o_d, + Optional maybe_lse_d, int64_t mask_mode_code_d, int64_t layout_d, + int64_t window_left_d, Optional maybe_custom_mask_d, + Optional maybe_mask_indptr_d, Optional maybe_alibi_slopes_d, + double logits_soft_cap_d, double sm_scale_d, double rope_rcp_scale_d, double rope_rcp_theta_d, + bool enable_pdl, TensorView sm_aware_sched); + +// Batch-request prefill attention with KV-Cache operator +TVM_FFI_DLL_EXPORT_TYPED_FUNC(batch_pod_with_kv_cache_tensor, batch_pod_with_kv_cache_tensor); diff --git a/csrc/batch_pod_kernel_inst.jinja b/csrc/batch_pod_kernel_inst.jinja new file mode 100644 index 0000000000..cb2c39d32b --- /dev/null +++ b/csrc/batch_pod_kernel_inst.jinja @@ -0,0 +1,31 @@ +#include +#include +#include +#include +#include +#include 
+#include +#include +#include + +#include "batch_pod_config.inc" + +using namespace flashinfer; + +namespace flashinfer { +constexpr auto use_custom_mask_p = {{ mask_mode_p }} == MaskMode::kCustom; +constexpr auto use_custom_mask_d = {{ mask_mode_d }} == MaskMode::kCustom; +// Not sure about the below declaration +constexpr auto POS_ENCODING_MODE = PosEncodingMode::kNone; + +{% for cta_tile_q in [16, 64, 128] %} +template cudaError_t BatchPODWithKVCacheTensorDispatched< + {{ head_dim_qk }}, {{ head_dim_vo }}, POS_ENCODING_MODE, + {{ use_fp16_qk_reduction }}, /*CTA_TILE_Q_P=*/{{cta_tile_q}}, {{ mask_mode_p }}, + /*CTA_TILE_Q_D=*/16, {{ mask_mode_d }}, {{ variant_name_p }}, + {{ variant_name_d }}, PrefillParams, DecodeParams>( + PrefillParams prefill_params, {{ dtype_o }}* tmp_v_p, float *tmp_s_p, + DecodeParams decode_params, {{ dtype_o }}* tmp_v_d, float *tmp_s_d, + bool enable_pdl, cudaStream_t stream, int* sm_aware_sched); +{% endfor %} +} diff --git a/csrc/batch_prefill.cu b/csrc/batch_prefill.cu index 5d7182bdc5..6011ba2063 100644 --- a/csrc/batch_prefill.cu +++ b/csrc/batch_prefill.cu @@ -50,7 +50,7 @@ Array BatchPrefillWithKVCachePlan( TensorView kv_len_arr, int64_t total_num_rows, int64_t batch_size, int64_t num_qo_heads, int64_t num_kv_heads, int64_t page_size, bool enable_cuda_graph, int64_t head_dim_qk, int64_t head_dim_vo, bool causal, int64_t window_left, int64_t fixed_split_size, - bool disable_split_kv) { + bool disable_split_kv, int64_t num_colocated_ctas = 0) { size_t float_workspace_size_in_bytes = float_workspace_buffer.size(0) * get_element_size(float_workspace_buffer); size_t int_workspace_size_in_bytes = @@ -58,7 +58,7 @@ Array BatchPrefillWithKVCachePlan( PrefillPlanInfo plan_info; - cudaSetDevice(float_workspace_buffer.device().device_id); + ffi::CUDADeviceGuard device_guard(float_workspace_buffer.device().device_id); const cudaStream_t stream = get_stream(float_workspace_buffer.device()); cudaError_t status = PrefillPlan( float_workspace_buffer.data_ptr(), float_workspace_size_in_bytes, @@ -66,7 +66,8 @@ Array BatchPrefillWithKVCachePlan( int_workspace_size_in_bytes, plan_info, static_cast(qo_indptr.data_ptr()), static_cast(kv_indptr.data_ptr()), total_num_rows, batch_size, num_qo_heads, num_kv_heads, head_dim_qk, head_dim_vo, page_size, enable_cuda_graph, - /*sizeof_dtype_o=*/2, window_left, fixed_split_size, disable_split_kv, stream); + /*sizeof_dtype_o=*/2, window_left, fixed_split_size, disable_split_kv, num_colocated_ctas, + stream); TVM_FFI_ICHECK(status == cudaSuccess) << "Failed to plan prefill with error: " << cudaGetErrorString(status); @@ -113,7 +114,7 @@ void BatchPrefillWithRaggedKVCacheRun(TensorView float_workspace_buffer, const MaskMode mask_mode = static_cast(mask_mode_code); - cudaSetDevice(float_workspace_buffer.device().device_id); + ffi::CUDADeviceGuard device_guard(float_workspace_buffer.device().device_id); const cudaStream_t stream = get_stream(float_workspace_buffer.device()); DISPATCH_context( @@ -246,7 +247,7 @@ void BatchPrefillWithPagedKVCacheRun(TensorView float_workspace_buffer, << "k/v strides differs at " << i; } - cudaSetDevice(float_workspace_buffer.device().device_id); + ffi::CUDADeviceGuard device_guard(float_workspace_buffer.device().device_id); const cudaStream_t stream = get_stream(float_workspace_buffer.device()); DISPATCH_context( diff --git a/csrc/batch_prefill_fp8_ragged_sm90_kernel_inst.jinja b/csrc/batch_prefill_fp8_ragged_sm90_kernel_inst.jinja index 8225edbb00..0b615e57e8 100644 --- 
a/csrc/batch_prefill_fp8_ragged_sm90_kernel_inst.jinja +++ b/csrc/batch_prefill_fp8_ragged_sm90_kernel_inst.jinja @@ -1 +1,15 @@ -// TODO: Not implemented yet +#include +#include "batch_prefill_sm90_config.inc" + +namespace flashinfer { + +{% for same_scheduler_for_all_heads in ["true", "false"] %} +template cudaError_t BatchFP8PrefillWithRaggedKVCacheDispatched + <{{ head_dim_qk }}, + {{ mask_mode }}, + /*USE_SLIDING_WINDOW=*/{{ use_sliding_window }}, + /*SAME_SCHEDULER_FOR_ALL_HEADS=*/{{ same_scheduler_for_all_heads }}, + {{ variant_name }}, RaggedParams>(RaggedParams& params, bool enable_pdl, cudaStream_t stream); +{% endfor %} + +} // namespace flashinfer diff --git a/csrc/batch_prefill_fp8_sm90.cu b/csrc/batch_prefill_fp8_sm90.cu index 7c8680dc0b..6bf67c9928 100644 --- a/csrc/batch_prefill_fp8_sm90.cu +++ b/csrc/batch_prefill_fp8_sm90.cu @@ -29,6 +29,11 @@ template +cudaError_t BatchFP8PrefillWithRaggedKVCacheDispatched(Params& params, bool enable_pdl, + cudaStream_t stream); + } // namespace flashinfer using namespace flashinfer; @@ -50,7 +55,7 @@ Array BatchPrefillWithKVCacheSM90Plan( flashinfer::PrefillPlanSM90Info plan_info; - cudaSetDevice(float_workspace_buffer.device().device_id); + ffi::CUDADeviceGuard device_guard(float_workspace_buffer.device().device_id); const cudaStream_t stream = get_stream(float_workspace_buffer.device()); cudaError_t status = PrefillSM90Plan( @@ -78,7 +83,94 @@ void BatchPrefillWithRaggedKVCacheSM90Run(ffi::TensorView float_workspace_buffer int64_t window_left, bool enable_pdl // placeholder ADDITIONAL_FUNC_PARAMS) { - return; // TODO: Implement this function + PrefillPlanSM90Info plan_info; + plan_info.FromVector(std::vector(plan_info_vec.begin(), plan_info_vec.end())); + + if (maybe_lse.has_value()) { + const auto& lse = maybe_lse.value(); + TVM_FFI_ICHECK_EQ(lse.size(0), q.size(0)); + TVM_FFI_ICHECK_EQ(lse.size(1), q.size(1)); + } + + void* float_buffer_ptr = float_workspace_buffer.data_ptr(); + void* int_buffer_ptr = int_workspace_buffer.data_ptr(); + + int64_t head_dim_qk = q.size(2); + int64_t head_dim_vo = v.size(2); + + QKVLayout kv_layout = static_cast(layout); + + cudaSetDevice(float_workspace_buffer.device().device_id); + const cudaStream_t stream = get_stream(float_workspace_buffer.device()); + const MaskMode mask_mode = static_cast(mask_mode_code); + bool use_swa = window_left != -1; + + DISPATCH_context( + DTypeQ, DTypeKV, DTypeO, IdType, MASK_MODE, HEAD_DIM_QK, HEAD_DIM_VO, USE_SLIDING_WINDOW, + USE_LOGITS_SOFT_CAP, AttentionVariant, RaggedParams, PagedParams, [&] { + RaggedParams params; + + params.q_ptr = static_cast(q.data_ptr()); + params.k_ptr = static_cast(k.data_ptr()); + params.v_ptr = static_cast(v.data_ptr()); + params.o_ptr = static_cast(o.data_ptr()); + params.lse_ptr = maybe_lse ? 
static_cast(maybe_lse.value().data_ptr()) : nullptr; + params.q_stride_n = q.stride(0); + params.q_stride_h = q.stride(1); + params.o_stride_n = o.stride(0); + params.o_stride_h = o.stride(1); + if (kv_layout == QKVLayout::kNHD) { + params.k_stride_n = k.stride(0); + params.k_stride_h = k.stride(1); + params.v_stride_n = v.stride(0); + params.v_stride_h = v.stride(1); + } else { + params.k_stride_h = k.stride(0); + params.k_stride_n = k.stride(1); + params.v_stride_h = v.stride(0); + params.v_stride_n = v.stride(1); + } + params.nnz_qo = q.size(0); + params.nnz_kv = k.size(0); + params.num_qo_heads = q.size(1); + params.num_kv_heads = k.size(1); + params.group_size = params.num_qo_heads / params.num_kv_heads; + params.window_left = window_left; + params.causal = mask_mode_code == 1; + params.qo_tile_indices = + GetPtrFromBaseOffset(int_buffer_ptr, plan_info.qo_tile_indices_offset); + params.qo_indptr = GetPtrFromBaseOffset(int_buffer_ptr, plan_info.qo_indptr_offset); + params.kv_indptr = GetPtrFromBaseOffset(int_buffer_ptr, plan_info.kv_indptr_offset); + params.qo_lens = GetPtrFromBaseOffset(int_buffer_ptr, plan_info.qo_len_offset); + params.kv_lens = GetPtrFromBaseOffset(int_buffer_ptr, plan_info.kv_len_offset); + params.head_indices = + GetPtrFromBaseOffset(int_buffer_ptr, plan_info.head_indices_offset); + params.work_indptr = + GetPtrFromBaseOffset(int_buffer_ptr, plan_info.work_indptr_offset); + params.batch_indices = + GetPtrFromBaseOffset(int_buffer_ptr, plan_info.batch_indices_offset); + + ADDITIONAL_PARAMS_SETTER + + // Not support various head_dim for now + static_assert(HEAD_DIM_QK == HEAD_DIM_VO, "head_dim_qk and head_dim_vo should be the same"); + // Currently only support same quantization precision + static_assert(std::is_same_v); + + bool same_schedule_for_all_heads = plan_info.same_schedule_for_all_heads; + DISPATCH_BOOL(same_schedule_for_all_heads, SAME_SCHEDULER_FOR_ALL_HEADS, [&] { + cudaError_t status = + BatchFP8PrefillWithRaggedKVCacheDispatched(params, enable_pdl, + stream); + + TVM_FFI_ICHECK(status == cudaSuccess) + << "BatchPrefillWithRaggedKVCacheSM90Run failed with error: " + << cudaGetErrorString(status); + return true; + }); + }); } void BatchPrefillWithPagedKVCacheSM90Run( @@ -111,7 +203,7 @@ void BatchPrefillWithPagedKVCacheSM90Run( void* float_buffer_ptr = float_workspace_buffer.data_ptr(); void* int_buffer_ptr = int_workspace_buffer.data_ptr(); - cudaSetDevice(float_workspace_buffer.device().device_id); + ffi::CUDADeviceGuard device_guard(float_workspace_buffer.device().device_id); const cudaStream_t stream = get_stream(float_workspace_buffer.device()); const MaskMode mask_mode = static_cast(mask_mode_code); bool use_swa = window_left != -1; @@ -136,12 +228,18 @@ void BatchPrefillWithPagedKVCacheSM90Run( params.k_stride_h = paged_k_cache.stride(2); params.v_stride_n = paged_v_cache.stride(1); params.v_stride_h = paged_v_cache.stride(2); + // For sparse paged KV cache, store the stride between pages + params.k_page_stride = paged_k_cache.stride(0); + params.v_page_stride = paged_v_cache.stride(0); } else { // (num_pages, num_heads, page_size, head_dim) params.k_stride_h = paged_k_cache.stride(1); params.k_stride_n = paged_k_cache.stride(2); params.v_stride_h = paged_v_cache.stride(1); params.v_stride_n = paged_v_cache.stride(2); + // For sparse paged KV cache, store the stride between pages + params.k_page_stride = paged_k_cache.stride(0); + params.v_page_stride = paged_v_cache.stride(0); } params.nnz_qo = q.size(0); params.num_qo_heads = q.size(1); diff 
--git a/csrc/batch_prefill_jit_binding.cu b/csrc/batch_prefill_jit_binding.cu index da1e1981dc..3dda0f115a 100644 --- a/csrc/batch_prefill_jit_binding.cu +++ b/csrc/batch_prefill_jit_binding.cu @@ -25,7 +25,7 @@ Array BatchPrefillWithKVCachePlan( TensorView kv_len_arr, int64_t total_num_rows, int64_t batch_size, int64_t num_qo_heads, int64_t num_kv_heads, int64_t page_size, bool enable_cuda_graph, int64_t head_dim_qk, int64_t head_dim_vo, bool causal, int64_t window_left, int64_t fixed_split_size, - bool disable_split_kv); + bool disable_split_kv, int64_t num_colocated_ctas); void BatchPrefillWithRaggedKVCacheRun(TensorView float_workspace_buffer, TensorView int_workspace_buffer, Array plan_info_vec, diff --git a/csrc/batch_prefill_sm90.cu b/csrc/batch_prefill_sm90.cu index 1cf78bab59..564ed6b08c 100644 --- a/csrc/batch_prefill_sm90.cu +++ b/csrc/batch_prefill_sm90.cu @@ -56,7 +56,7 @@ Array BatchPrefillWithKVCacheSM90Plan( flashinfer::PrefillPlanSM90Info plan_info; - cudaSetDevice(float_workspace_buffer.device().device_id); + ffi::CUDADeviceGuard device_guard(float_workspace_buffer.device().device_id); const cudaStream_t stream = get_stream(float_workspace_buffer.device()); cudaError_t status = PrefillSM90Plan( @@ -97,7 +97,7 @@ void BatchPrefillWithRaggedKVCacheSM90Run( QKVLayout kv_layout = static_cast(layout); - cudaSetDevice(float_workspace_buffer.device().device_id); + ffi::CUDADeviceGuard device_guard(float_workspace_buffer.device().device_id); const cudaStream_t stream = get_stream(float_workspace_buffer.device()); const MaskMode mask_mode = static_cast(mask_mode_code); bool use_swa = window_left != -1; @@ -193,7 +193,7 @@ void BatchPrefillWithPagedKVCacheSM90Run( void* float_buffer_ptr = float_workspace_buffer.data_ptr(); void* int_buffer_ptr = int_workspace_buffer.data_ptr(); - cudaSetDevice(float_workspace_buffer.device().device_id); + ffi::CUDADeviceGuard device_guard(float_workspace_buffer.device().device_id); const cudaStream_t stream = get_stream(float_workspace_buffer.device()); const MaskMode mask_mode = static_cast(mask_mode_code); bool use_swa = window_left != -1; @@ -218,13 +218,24 @@ void BatchPrefillWithPagedKVCacheSM90Run( params.k_stride_h = paged_k_cache.stride(2); params.v_stride_n = paged_v_cache.stride(1); params.v_stride_h = paged_v_cache.stride(2); + // For sparse paged KV cache, store the stride between pages + params.k_page_stride = paged_k_cache.stride(0); + params.v_page_stride = paged_v_cache.stride(0); } else { // (num_pages, num_heads, page_size, head_dim) params.k_stride_h = paged_k_cache.stride(1); params.k_stride_n = paged_k_cache.stride(2); params.v_stride_h = paged_v_cache.stride(1); params.v_stride_n = paged_v_cache.stride(2); + // For sparse paged KV cache, store the stride between pages + params.k_page_stride = paged_k_cache.stride(0); + params.v_page_stride = paged_v_cache.stride(0); } + // Sparse mainloop assumes K and V have same strides for efficiency + TVM_FFI_ICHECK_EQ(params.k_page_stride, params.v_page_stride) + << "K and V must have same page stride for sparse attention"; + TVM_FFI_ICHECK_EQ(params.k_stride_n, params.v_stride_n) + << "K and V must have same stride_n for sparse attention"; params.nnz_qo = q.size(0); params.num_qo_heads = q.size(1); params.num_kv_heads = num_kv_heads; diff --git a/csrc/batch_prefill_sm90_customize_config.jinja b/csrc/batch_prefill_sm90_customize_config.jinja index b37ecac60d..640637c7df 100644 --- a/csrc/batch_prefill_sm90_customize_config.jinja +++ b/csrc/batch_prefill_sm90_customize_config.jinja @@ 
-104,6 +104,11 @@ struct PagedParams { int64_t o_stride_h; int64_t nnz_qo; + // NOTE: For sparse paged KV cache, we need the stride between pages + // This is paged_k_cache.stride(0), not the layout stride + int64_t k_page_stride; // Stride between pages for K + int64_t v_page_stride; // Stride between pages for V + int head_dim; int num_qo_heads; int num_kv_heads; diff --git a/csrc/blackwell_fmha_plan.cu b/csrc/blackwell_fmha_plan.cu index ef9b1475ea..e20b98179e 100644 --- a/csrc/blackwell_fmha_plan.cu +++ b/csrc/blackwell_fmha_plan.cu @@ -21,7 +21,7 @@ void blackwell_fmha_plan(TensorView qo_segment_offsets, TensorView kv_segment_of TensorView work_indptr, TensorView qo_tile_indices, TensorView head_indices, TensorView batch_indices, int64_t qo_tile_size, int64_t num_heads, int64_t num_buckets, bool causal) { - cudaSetDevice(qo_segment_offsets.device().device_id); + ffi::CUDADeviceGuard device_guard(qo_segment_offsets.device().device_id); const cudaStream_t stream = get_stream(qo_tile_indices.device()); int batch_size = qo_segment_offsets.size(0) - 1; diff --git a/csrc/bmm_fp8.cu b/csrc/bmm_fp8.cu index ea8417b617..4de464fac0 100644 --- a/csrc/bmm_fp8.cu +++ b/csrc/bmm_fp8.cu @@ -45,7 +45,7 @@ void bmm_fp8(TensorView A, TensorView B, TensorView D, TensorView A_scale, Tenso auto n = B.size(2); auto lt_handle = reinterpret_cast(cublas_handle); - cudaSetDevice(A.device().device_id); + ffi::CUDADeviceGuard device_guard(A.device().device_id); auto stream = get_stream(A.device()); auto status = flashinfer::bmm_fp8::bmm_fp8_internal_cublaslt( diff --git a/csrc/cascade.cu b/csrc/cascade.cu index 98e4a590dc..4c3a64e95b 100644 --- a/csrc/cascade.cu +++ b/csrc/cascade.cu @@ -41,7 +41,7 @@ void merge_state(TensorView v_a, TensorView s_a, TensorView v_b, TensorView s_b, unsigned int num_heads = v_a.size(1); unsigned int head_dim = v_a.size(2); - cudaSetDevice(v_a.device().device_id); + ffi::CUDADeviceGuard device_guard(v_a.device().device_id); auto stream = get_stream(v_a.device()); bool success = DISPATCH_DLPACK_DTYPE_TO_CTYPE_FP16(v_a.dtype(), c_type, [&] { @@ -85,7 +85,7 @@ void merge_state_in_place(TensorView v, TensorView s, TensorView v_other, Tensor unsigned int num_heads = v.size(1); unsigned int head_dim = v.size(2); - cudaSetDevice(v.device().device_id); + ffi::CUDADeviceGuard device_guard(v.device().device_id); auto stream = get_stream(v.device()); bool success = DISPATCH_DLPACK_DTYPE_TO_CTYPE_FP16(v.dtype(), c_type, [&] { cudaError_t status = MergeStateInPlace( @@ -114,7 +114,7 @@ void merge_states(TensorView v, TensorView s, TensorView v_merged, TensorView s_ unsigned int num_heads = v.size(2); unsigned int head_dim = v.size(3); - cudaSetDevice(v.device().device_id); + ffi::CUDADeviceGuard device_guard(v.device().device_id); auto stream = get_stream(v.device()); bool success = DISPATCH_DLPACK_DTYPE_TO_CTYPE_FP16(v.dtype(), c_type, [&] { cudaError_t status = MergeStates( diff --git a/csrc/cutlass_mla.cu b/csrc/cutlass_mla.cu index f68df30bea..4700fe5f96 100644 --- a/csrc/cutlass_mla.cu +++ b/csrc/cutlass_mla.cu @@ -23,7 +23,7 @@ using namespace flashinfer::attention; void CutlassMLAPagedAttention(ffi::TensorView workspace, ffi::TensorView out, ffi::TensorView lse, ffi::TensorView q_nope_pe, ffi::TensorView ckv_kpe_cache, ffi::TensorView kv_lens, ffi::TensorView page_table) { - cudaSetDevice(q_nope_pe.device().device_id); + ffi::CUDADeviceGuard device_guard(q_nope_pe.device().device_id); const cudaStream_t stream = get_stream(q_nope_pe.device()); int device_index = 
q_nope_pe.device().device_id; diff --git a/csrc/dsv3_router_gemm.cu b/csrc/dsv3_router_gemm.cu new file mode 100644 index 0000000000..2d44147d97 --- /dev/null +++ b/csrc/dsv3_router_gemm.cu @@ -0,0 +1,152 @@ +#include "flashinfer/gemm/dsv3_router_gemm.cuh" +#include "tvm_ffi_utils.h" + +namespace flashinfer::trtllm_dsv3_router_gemm { +template +void invokeRouterGemm(float* output, T const* mat_a, T const* mat_b, cudaStream_t stream, + bool use_pdl = false) { + constexpr int VPT = 16 / sizeof(T); + constexpr int kBlockSize = 128; + cudaLaunchConfig_t config; + config.gridDim = kNumExperts; + config.blockDim = kBlockSize; + config.dynamicSmemBytes = 0; + config.stream = stream; + cudaLaunchAttribute attrs[1]; + attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization; + attrs[0].val.programmaticStreamSerializationAllowed = use_pdl; + config.numAttrs = 1; + config.attrs = attrs; + auto status = cudaLaunchKernelEx( + &config, router_gemm_kernel, output, + mat_a, mat_b); + TVM_FFI_ICHECK(status == cudaSuccess) + << "cudaLaunchKernelEx failed with error code " << cudaGetErrorString(status); +} + +template void invokeRouterGemm<__nv_bfloat16, 1, 256, 7168>(float*, __nv_bfloat16 const*, + __nv_bfloat16 const*, cudaStream_t, + bool); + +template void invokeRouterGemm<__nv_bfloat16, 2, 256, 7168>(float*, __nv_bfloat16 const*, + __nv_bfloat16 const*, cudaStream_t, + bool); + +template void invokeRouterGemm<__nv_bfloat16, 3, 256, 7168>(float*, __nv_bfloat16 const*, + __nv_bfloat16 const*, cudaStream_t, + bool); + +template void invokeRouterGemm<__nv_bfloat16, 4, 256, 7168>(float*, __nv_bfloat16 const*, + __nv_bfloat16 const*, cudaStream_t, + bool); + +template void invokeRouterGemm<__nv_bfloat16, 5, 256, 7168>(float*, __nv_bfloat16 const*, + __nv_bfloat16 const*, cudaStream_t, + bool); + +template void invokeRouterGemm<__nv_bfloat16, 6, 256, 7168>(float*, __nv_bfloat16 const*, + __nv_bfloat16 const*, cudaStream_t, + bool); + +template void invokeRouterGemm<__nv_bfloat16, 7, 256, 7168>(float*, __nv_bfloat16 const*, + __nv_bfloat16 const*, cudaStream_t, + bool); + +template void invokeRouterGemm<__nv_bfloat16, 8, 256, 7168>(float*, __nv_bfloat16 const*, + __nv_bfloat16 const*, cudaStream_t, + bool); + +template void invokeRouterGemm<__nv_bfloat16, 9, 256, 7168>(float*, __nv_bfloat16 const*, + __nv_bfloat16 const*, cudaStream_t, + bool); + +template void invokeRouterGemm<__nv_bfloat16, 10, 256, 7168>(float*, __nv_bfloat16 const*, + __nv_bfloat16 const*, cudaStream_t, + bool); + +template void invokeRouterGemm<__nv_bfloat16, 11, 256, 7168>(float*, __nv_bfloat16 const*, + __nv_bfloat16 const*, cudaStream_t, + bool); + +template void invokeRouterGemm<__nv_bfloat16, 12, 256, 7168>(float*, __nv_bfloat16 const*, + __nv_bfloat16 const*, cudaStream_t, + bool); + +template void invokeRouterGemm<__nv_bfloat16, 13, 256, 7168>(float*, __nv_bfloat16 const*, + __nv_bfloat16 const*, cudaStream_t, + bool); + +template void invokeRouterGemm<__nv_bfloat16, 14, 256, 7168>(float*, __nv_bfloat16 const*, + __nv_bfloat16 const*, cudaStream_t, + bool); + +template void invokeRouterGemm<__nv_bfloat16, 15, 256, 7168>(float*, __nv_bfloat16 const*, + __nv_bfloat16 const*, cudaStream_t, + bool); + +template void invokeRouterGemm<__nv_bfloat16, 16, 256, 7168>(float*, __nv_bfloat16 const*, + __nv_bfloat16 const*, cudaStream_t, + bool); + +template +struct LoopUnroller { + static void unroll(int num_tokens, float* output, __nv_bfloat16 const* input, + __nv_bfloat16 const* weights, cudaStream_t stream, bool launch_with_pdl) { + 
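// [Editorial note, not part of this patch] LoopUnroller walks the inclusive range kBegin..kEnd at
// compile time: the branch below matches the runtime num_tokens against kBegin and, on a hit,
// instantiates invokeRouterGemm with that token count baked in as a template constant (matching
// the explicit instantiations for 1..16 above); otherwise it recurses with kBegin + 1. The
// kBegin == kEnd specialization that follows terminates the recursion and rejects unsupported
// token counts.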
if (num_tokens == kBegin) { + invokeRouterGemm<__nv_bfloat16, kBegin, kNumExperts, kHiddenDim>(output, input, weights, + stream, launch_with_pdl); + } else { + LoopUnroller::unroll( + num_tokens, output, input, weights, stream, launch_with_pdl); + } + } +}; + +template +struct LoopUnroller { + static void unroll(int num_tokens, float* output, __nv_bfloat16 const* input, + __nv_bfloat16 const* weights, cudaStream_t stream, bool launch_with_pdl) { + if (num_tokens == kEnd) { + invokeRouterGemm<__nv_bfloat16, kEnd, kNumExperts, kHiddenDim>(output, input, weights, stream, + launch_with_pdl); + } else { + throw std::invalid_argument("Invalid num_tokens, only supports 1 to 16"); + } + } +}; + +void dsv3_router_gemm_op(TensorView mat_a, TensorView mat_b, TensorView out, bool launch_with_pdl) { + int const num_tokens = mat_a.sizes()[0]; + int const num_experts = mat_b.sizes()[1]; + int const hidden_dim = mat_a.sizes()[1]; + auto const out_dtype_ = out.dtype(); + auto const data_type = mat_a.dtype(); + constexpr int kNumExperts = 256; + constexpr int kHiddenDim = 7168; + std::vector output_size = {mat_a.sizes()[0], mat_b.sizes()[1]}; + TVM_FFI_ICHECK(mat_a.dim() == 2 && mat_b.dim() == 2) << "mat_a and mat_b must be 2D tensors"; + TVM_FFI_ICHECK(mat_a.strides()[1] == 1 && out.strides()[1] == 1) + << "mat_a and out must be row-major"; + TVM_FFI_ICHECK(mat_b.strides()[0] == 1) << "mat_b must be column-major"; + auto stream = get_stream(mat_a.device()); + bool use_custom_kernel = false; + if (num_tokens >= 1 && num_tokens <= 16 && num_experts == kNumExperts && + hidden_dim == kHiddenDim && encode_dlpack_dtype(data_type) == bfloat16_code && + encode_dlpack_dtype(out_dtype_) == float32_code) { + use_custom_kernel = true; + } + + if (use_custom_kernel) { + LoopUnroller<1, 16, kNumExperts, kHiddenDim>::unroll( + num_tokens, reinterpret_cast(out.data_ptr()), + reinterpret_cast<__nv_bfloat16 const*>(mat_a.data_ptr()), + reinterpret_cast<__nv_bfloat16 const*>(mat_b.data_ptr()), stream, launch_with_pdl); + } else { + TVM_FFI_LOG_AND_THROW(NotImplementedError) << "Unsupported input tensor size"; + } +} + +TVM_FFI_DLL_EXPORT_TYPED_FUNC(dsv3_router_gemm_op, + flashinfer::trtllm_dsv3_router_gemm::dsv3_router_gemm_op); + +} // namespace flashinfer::trtllm_dsv3_router_gemm diff --git a/csrc/flashinfer_page_binding.cu b/csrc/flashinfer_page_binding.cu index dbab4f5cb8..97105712f7 100644 --- a/csrc/flashinfer_page_binding.cu +++ b/csrc/flashinfer_page_binding.cu @@ -27,12 +27,5 @@ void append_paged_mla_kv_cache(TensorView append_ckv, TensorView append_kpe, TensorView kpe_cache, TensorView kv_indices, TensorView kv_indptr, TensorView kv_last_page_len); -void block_sparse_indices_to_vector_sparse_offsets( - TensorView block_sparse_indices, TensorView block_sparse_indptr, - TensorView vector_sparse_offsets, TensorView vector_sparse_indptr, TensorView kv_len_arr, - int64_t stride_block, int64_t stride_n, int64_t batch_size, int64_t block_size); - TVM_FFI_DLL_EXPORT_TYPED_FUNC(append_paged_kv_cache, append_paged_kv_cache); TVM_FFI_DLL_EXPORT_TYPED_FUNC(append_paged_mla_kv_cache, append_paged_mla_kv_cache); -TVM_FFI_DLL_EXPORT_TYPED_FUNC(block_sparse_indices_to_vector_sparse_offsets, - block_sparse_indices_to_vector_sparse_offsets); diff --git a/csrc/flashinfer_rope_binding.cu b/csrc/flashinfer_rope_binding.cu index 23124064d8..94809da735 100644 --- a/csrc/flashinfer_rope_binding.cu +++ b/csrc/flashinfer_rope_binding.cu @@ -45,9 +45,19 @@ void rope_quantize(TensorView q_rope_in, TensorView k_rope_in, TensorView q_nope 
TensorView pos_ids, double quant_scale_q, double quant_scale_kv, bool interleave, bool enable_pdl); +void rope_quantize_append_paged_kv_cache( + TensorView q_rope_in, TensorView k_rope_in, TensorView q_nope_in, TensorView k_nope_in, + TensorView v_in, TensorView q_rope_out, TensorView q_nope_out, TensorView cos_sin_cache, + TensorView pos_ids, TensorView k_cache, TensorView v_cache, TensorView ckv_cache, + TensorView kpe_cache, TensorView kv_indices, TensorView kv_indptr, TensorView batch_indices, + TensorView positions, int64_t kv_layout_code, int64_t page_size, double quant_scale_q, + double quant_scale_kv, bool interleave, bool enable_pdl); + TVM_FFI_DLL_EXPORT_TYPED_FUNC(apply_rope, apply_rope); TVM_FFI_DLL_EXPORT_TYPED_FUNC(apply_llama31_rope, apply_llama31_rope); TVM_FFI_DLL_EXPORT_TYPED_FUNC(apply_rope_pos_ids, apply_rope_pos_ids); TVM_FFI_DLL_EXPORT_TYPED_FUNC(apply_llama31_rope_pos_ids, apply_llama31_rope_pos_ids); TVM_FFI_DLL_EXPORT_TYPED_FUNC(apply_rope_pos_ids_cos_sin_cache, apply_rope_pos_ids_cos_sin_cache); TVM_FFI_DLL_EXPORT_TYPED_FUNC(rope_quantize, rope_quantize); +TVM_FFI_DLL_EXPORT_TYPED_FUNC(rope_quantize_append_paged_kv_cache, + rope_quantize_append_paged_kv_cache); diff --git a/csrc/flashinfer_xqa_binding.cu b/csrc/flashinfer_xqa_binding.cu index 003a23a5f6..8bcbafafd6 100644 --- a/csrc/flashinfer_xqa_binding.cu +++ b/csrc/flashinfer_xqa_binding.cu @@ -17,37 +17,26 @@ #include "tvm_ffi_utils.h" #if MLA_WRAPPER -void xqa_wrapper_mla(int64_t multiProcessorCount, double qScale, TensorView output, TensorView q, -#if PAGED_KV_CACHE_LAYOUT == 1 - TensorView kCacheVLLM, TensorView vCacheVLLM, -#else - TensorView pool, -#endif - TensorView kvCachePageList, int64_t maxSeqLen, TensorView seqLen, - int64_t batchSize, TensorView kvCacheScale, TensorView semaphores, - TensorView scratch); +void xqa_wrapper_mla(int64_t multiProcessorCount, double qScale, + tvm::ffi::Optional qScaleTensor, TensorView output, TensorView q, + TensorView kCacheVLLM, TensorView vCacheVLLM, TensorView kvCachePageList, + int64_t maxSeqLen, TensorView seqLen, int64_t batchSize, double kvCacheScale, + tvm::ffi::Optional kvScaleTensor, TensorView semaphores, + TensorView scratch, bool enable_pdl); TVM_FFI_DLL_EXPORT_TYPED_FUNC(xqa_wrapper_mla, xqa_wrapper_mla); #else void xqa_wrapper(bool run_sm90_fp8_mha, int64_t multiProcessorCount, int64_t nbKHeads, - int64_t slidingWinSize, double qScale, TensorView output, -#if LOW_PREC_OUTPUT - TensorView rcpOutScale, -#endif - TensorView q, tvm::ffi::Optional attentionSinks, -#if PAGED_KV_CACHE_LAYOUT == 1 - TensorView kCacheVLLM, TensorView vCacheVLLM, -#else - TensorView pool, -#endif - TensorView kvCachePageList, int64_t maxSeqLen, TensorView seqLen, - int64_t batchSize, TensorView kvCacheScale, -#if SPEC_DEC - int64_t qSeqLen, TensorView qCuSeqLens, TensorView mask, -#endif - TensorView semaphores, TensorView scratch); + int64_t slidingWinSize, double qScale, tvm::ffi::Optional qScaleTensor, + TensorView output, double rcpOutScale, TensorView q, + tvm::ffi::Optional attentionSinks, TensorView kCacheVLLM, + TensorView vCacheVLLM, TensorView kvCachePageList, int64_t maxSeqLen, + TensorView seqLen, int64_t batchSize, double kvCacheScale, + tvm::ffi::Optional kvScaleTensor, int64_t qSeqLen, + tvm::ffi::Optional mask, TensorView semaphores, TensorView scratch, + bool enable_pdl); TVM_FFI_DLL_EXPORT_TYPED_FUNC(xqa_wrapper, xqa_wrapper); diff --git a/csrc/fmhaReduction.cu b/csrc/fmhaReduction.cu index 1f1ca8c755..e329e1c14b 100644 --- a/csrc/fmhaReduction.cu +++ 
b/csrc/fmhaReduction.cu @@ -34,7 +34,7 @@ namespace kernels { template __global__ void __launch_bounds__(NumThreadsPerCta, 2) - fmhaReductionKernel(KernelParams const params, int32_t numCtasForReduction, + fmhaReductionKernel(KernelParams const params, bool sparseMla, int32_t numCtasForReduction, int32_t numCtasForAllHeads, int32_t numHeadDimCtasV) { // clang-format off // The shape of partialO buffer: [batchSize, numHeadCtas, numCtasQ, numCtasKv, TileSizePerCtaQ, headDimPerCta]. @@ -64,10 +64,25 @@ __global__ void __launch_bounds__(NumThreadsPerCta, 2) // The number of validRows. int32_t const numValidRows{TileSizePerCtaQ}; + // The seqOffsetQ. + int32_t const seqOffsetQ{params.ptrCumSeqLensQ == nullptr ? batchIdx * params.mMaxSeqLenQ + : params.ptrCumSeqLensQ[batchIdx]}; + // The seqLenQ. + int32_t const seqLenQ{params.ptrCumSeqLensQ == nullptr + ? params.mMaxSeqLenQ + : (params.ptrCumSeqLensQ[batchIdx + 1] - seqOffsetQ)}; + // Early exit if ctaIdxQ >= seqLenQ, where each CTA processes one tokenQ. + if (ctaIdxQ >= seqLenQ) { + return; + } // The actual number of seqLenKv. int32_t seqLenKv{params.ptrSeqLensKv[batchIdx]}; // Consider the causal-mask speculative decoding. seqLenKv = seqLenKv - ((params.mMaxSeqLenQ - 1) - ctaIdxQ); + // Consider sparseMlaTopK. + if (sparseMla) { + seqLenKv = min(seqLenKv, params.mSparseMlaTopK); + } // The actual number of CtasKv (TileSizeKv is always 128 for now). int32_t numCtasKv{min((seqLenKv + 127) / 128, params.mMaxNumCtasKv)}; @@ -336,7 +351,7 @@ void runFmhaReduction(TllmGenFmhaKernelMetaInfo const& kernelMeta, KernelParams config.numAttrs = 1; // Select the kernel function pointer. - void (*kernel)(KernelParams const, int32_t, int32_t, int32_t) = nullptr; + void (*kernel)(KernelParams const, bool, int32_t, int32_t, int32_t) = nullptr; if (headDimPerCtaV == 128) { SELECT_FMHA_REDUCTION_KERNEL(128); } else if (headDimPerCtaV == 256) { @@ -346,8 +361,8 @@ void runFmhaReduction(TllmGenFmhaKernelMetaInfo const& kernelMeta, KernelParams } // Launch the kernel. - cudaLaunchKernelEx(&config, kernel, params, numCtasForReduction, numCtasForAllHeads, - numHeadDimCtasV); + cudaLaunchKernelEx(&config, kernel, params, kernelMeta.mSparseMla, numCtasForReduction, + numCtasForAllHeads, numHeadDimCtasV); cudaError_t err = cudaGetLastError(); FLASHINFER_CHECK(err == cudaSuccess, "Failed to launch kernel: ", cudaGetErrorString(err)); } diff --git a/csrc/fmha_cutlass_sm100.cu b/csrc/fmha_cutlass_sm100.cu index c50116fa7f..08f1235adf 100644 --- a/csrc/fmha_cutlass_sm100.cu +++ b/csrc/fmha_cutlass_sm100.cu @@ -96,7 +96,7 @@ void FMHACutlassSM100Run(ffi::TensorView workspace_buffer, ffi::TensorView q, ff int v_stride_n = v.stride(0); int v_stride_h = v.stride(1); - cudaSetDevice(qo_segment_offsets.device().device_id); + ffi::CUDADeviceGuard device_guard(qo_segment_offsets.device().device_id); const cudaStream_t stream = get_stream(o.device()); DISPATCH_context(DTypeIn, DTypeOut, HEAD_DIM_QK, HEAD_DIM_VO, MASK_MODE, [&] { diff --git a/csrc/fmha_v2/convert.cu b/csrc/fmha_v2/convert.cu new file mode 100644 index 0000000000..345bd008f9 --- /dev/null +++ b/csrc/fmha_v2/convert.cu @@ -0,0 +1,196 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2011-2024 NVIDIA CORPORATION & AFFILIATES. All rights + * reserved. SPDX-License-Identifier: NVIDIA TensorRT Source Code License Agreement + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. 
Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. + */ + +#include +#include +#include + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +__global__ void convert_int32_to_int8_kernel(void* dst, void const* src, size_t n, float scale) { + // The step. + size_t step = (size_t)gridDim.x * blockDim.x; + + // Iterate over the elements. + for (size_t ii = blockIdx.x * blockDim.x + threadIdx.x; ii < n / 4; ii += step) { + // Load 4 integers. + int4 tmp = reinterpret_cast(src)[ii]; + + // Convert to float and scale. + float x = static_cast(tmp.x) * scale; + float y = static_cast(tmp.y) * scale; + float z = static_cast(tmp.z) * scale; + float w = static_cast(tmp.w) * scale; + + // Convert to int8. + uint32_t a; + asm volatile("cvt.rni.sat.s8.f32 %0, %1;\n" : "=r"(a) : "f"(x)); + uint32_t b; + asm volatile("cvt.rni.sat.s8.f32 %0, %1;\n" : "=r"(b) : "f"(y)); + uint32_t c; + asm volatile("cvt.rni.sat.s8.f32 %0, %1;\n" : "=r"(c) : "f"(z)); + uint32_t d; + asm volatile("cvt.rni.sat.s8.f32 %0, %1;\n" : "=r"(d) : "f"(w)); + + // Compact. + char4 out; + out.x = reinterpret_cast(a); + out.y = reinterpret_cast(b); + out.z = reinterpret_cast(c); + out.w = reinterpret_cast(d); + + // Store. + reinterpret_cast(dst)[ii] = reinterpret_cast(out); + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +void run_conversion_int32_to_int8(void* dst, void const* src, int s, int b, int h, int d, + float scale) { + size_t n = (size_t)s * b * h * d; + convert_int32_to_int8_kernel<<<512, 256>>>(dst, src, n, scale); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +__device__ inline typename fmha::Uint_from_size_in_bytes::Type pack_float4( + float4 const& f); + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +__device__ inline uint2 pack_float4(float4 const& f) { + return fmha::float4_to_half4(f.x, f.y, f.z, f.w); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +__device__ inline uint2 pack_float4(float4 const& f) { + return fmha::float4_to_16bit_x4(f.x, f.y, f.z, f.w); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +__device__ inline uint32_t pack_float4(float4 const& f) { + return fmha::float4_to_e4m3x4(f.x, f.y, f.z, f.w); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// +template <> +__device__ inline uint32_t pack_float4(float4 const& f) { + return fmha::float4_to_e5m2x4(f.x, f.y, f.z, f.w); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +__global__ void convert_fp32_to_T_kernel(void* dst, void const* src, size_t n, float scale = 1.f) { + using Dst = typename fmha::Uint_from_size_in_bytes::Type; + + // The step. + size_t step = (size_t)gridDim.x * blockDim.x; + + // Iterate over the elements. + for (size_t ii = blockIdx.x * blockDim.x + threadIdx.x; ii < n / 4; ii += step) { + // Load 4 floats. + float4 tmp = reinterpret_cast(src)[ii]; + // Scale. + tmp.x *= scale; + tmp.y *= scale; + tmp.z *= scale; + tmp.w *= scale; + // Convert to 4 Ts. 
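    // pack_float4<T> (specialized above) packs the four scaled floats into one
    // register-sized value of the destination type: fp16x4/bf16x4 -> uint2,
    // e4m3x4/e5m2x4 -> uint32_t, so the single store below writes all four
    // converted elements at once.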
+ auto out = pack_float4(tmp); + + // Store. + reinterpret_cast(dst)[ii] = reinterpret_cast(out); + } +} + +template +__global__ void convert_T_to_fp32_kernel(void* dst, void const* src, size_t n, float scale = 1.f) { + using Src = typename fmha::Uint_from_size_in_bytes::Type; + + union { + Src raw; + T elt[4]; + } data; + + // The step. + size_t step = (size_t)gridDim.x * blockDim.x; + + // Iterate over the elements. + for (size_t ii = blockIdx.x * blockDim.x + threadIdx.x; ii < n / 4; ii += step) { + // Load 4 floats. + data.raw = reinterpret_cast(src)[ii]; + float4 out; + // Scale. + out.x = float(data.elt[0]) * scale; + out.y = float(data.elt[1]) * scale; + out.z = float(data.elt[2]) * scale; + out.w = float(data.elt[3]) * scale; + + // Store. + reinterpret_cast(dst)[ii] = reinterpret_cast(out); + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +void run_conversion_fp32_to_fp16(void* dst, void const* src, int s, int b, int h, int d) { + // No need to expose the scale factor for FP16/FP32. + size_t n = (size_t)s * b * h * d; + convert_fp32_to_T_kernel<<<512, 256>>>(dst, src, n, 1.f); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +void run_conversion_fp32_to_bf16(void* dst, void const* src, int s, int b, int h, int d) { + // No need to expose the scale factor for FP16/FP32. + size_t n = (size_t)s * b * h * d; + convert_fp32_to_T_kernel<<<512, 256>>>(dst, src, n, 1.f); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +void run_conversion_fp32_to_e4m3(void* dst, void const* src, size_t n, float scale_o) { + convert_fp32_to_T_kernel<<<512, 256>>>(dst, src, n, scale_o); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +void run_conversion_e4m3_to_fp32(void* dst, void const* src, size_t n, float scale_o) { + convert_T_to_fp32_kernel<<<512, 256>>>(dst, src, n, scale_o); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +void run_conversion_fp32_to_e4m3(void* dst, void const* src, int s, int b, int h, int d, + float scale_o) { + run_conversion_fp32_to_e4m3(dst, src, s * b * h * d, scale_o); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +void run_conversion_fp32_to_e5m2(void* dst, void const* src, size_t n, float scale_o) { + convert_fp32_to_T_kernel<<<512, 256>>>(dst, src, n, scale_o); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +void run_conversion_e5m2_to_fp32(void* dst, void const* src, size_t n, float scale_o) { + convert_T_to_fp32_kernel<<<512, 256>>>(dst, src, n, scale_o); +} diff --git a/csrc/fmha_v2/fmha/alibi_params.h b/csrc/fmha_v2/fmha/alibi_params.h new file mode 100644 index 0000000000..bee7ea1be9 --- /dev/null +++ b/csrc/fmha_v2/fmha/alibi_params.h @@ -0,0 +1,50 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2011-2024 NVIDIA CORPORATION & AFFILIATES. All rights + * reserved. SPDX-License-Identifier: NVIDIA TensorRT Source Code License Agreement + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. 
Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. + */ + +#pragma once + +namespace fmha { + +struct AlibiParams { + constexpr static int round_down_to_power_two(int x) { + x = x | (x >> 1); + x = x | (x >> 2); + x = x | (x >> 4); + x = x | (x >> 8); + x = x | (x >> 16); + return x - (x >> 1); + } + + AlibiParams() = default; + + AlibiParams(int h, float scale_after_alibi = 1.f) : scale_after_alibi(scale_after_alibi) { + h_pow_2 = round_down_to_power_two(h); + alibi_neg4_div_h = -4.0f / h_pow_2; + } + + AlibiParams(int h, int s, int tp_size, int rank, float scale_after_alibi = 1.f) + : AlibiParams(h * tp_size, scale_after_alibi) { + head_idx_offset = h * rank; + sequence_pos_offset = s * rank; + } + + int h_pow_2{}; + float alibi_neg4_div_h{}; + float scale_after_alibi{}; + // Could be simplified to `int rank` derive the others as `num_heads * rank, s * rank` at + // runtime, but this makes assumptions about the layout downstream + // (e.g. downstream may only split across the head dimension, so s would be the full sequence) + int head_idx_offset = 0; + int sequence_pos_offset = 0; +}; + +} // namespace fmha diff --git a/csrc/fmha_v2/fmha/fragment.h b/csrc/fmha_v2/fmha/fragment.h new file mode 100644 index 0000000000..01bdc0fdac --- /dev/null +++ b/csrc/fmha_v2/fmha/fragment.h @@ -0,0 +1,2311 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2011-2024 NVIDIA CORPORATION & AFFILIATES. All rights + * reserved. SPDX-License-Identifier: NVIDIA TensorRT Source Code License Agreement + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. 
+ */ + +#pragma once + +#include +#include + +#include + +namespace fmha { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Fragment_ldg {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Fragment_ldg<1> { + template + static inline __device__ void ldg(Fragment& f, int ii, void const* ptr) { + uint8_t tmp; + fmha::ldg(tmp, ptr); + f.u8(ii) = tmp; + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Fragment_ldg<2> { + template + static inline __device__ void ldg(Fragment& f, int ii, void const* ptr) { + uint16_t tmp; + fmha::ldg(tmp, ptr); + f.u16(ii) = tmp; + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Fragment_ldg<4> { + template + static inline __device__ void ldg(Fragment& f, int ii, void const* ptr) { + uint32_t tmp; + fmha::ldg(tmp, ptr); + f.reg(ii) = tmp; + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Fragment_ldg<8> { + template + static inline __device__ void ldg(Fragment& f, int ii, void const* ptr) { + uint2 tmp; + fmha::ldg(tmp, ptr); + f.reg(2 * ii + 0) = tmp.x; + f.reg(2 * ii + 1) = tmp.y; + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Fragment_ldg<16> { + template + static inline __device__ void ldg(Fragment& f, int ii, void const* ptr) { + uint4 tmp; + fmha::ldg(tmp, ptr); + f.reg(4 * ii + 0) = tmp.x; + f.reg(4 * ii + 1) = tmp.y; + f.reg(4 * ii + 2) = tmp.z; + f.reg(4 * ii + 3) = tmp.w; + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Fragment_lds {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Fragment_lds<2> { + template + static inline __device__ void lds(Fragment& f, int ii, uint32_t ptr) { + uint16_t tmp; + fmha::lds(tmp, ptr); + f.u16(ii) = tmp; + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Fragment_lds<4> { + template + static inline __device__ void lds(Fragment& f, int ii, uint32_t ptr) { + uint32_t tmp; + fmha::lds(tmp, ptr); + f.reg(ii) = tmp; + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Fragment_lds<8> { + template + static inline __device__ void lds(Fragment& f, int ii, uint32_t ptr) { + uint2 tmp; + fmha::lds(tmp, ptr); + f.reg(2 * ii + 0) = tmp.x; + f.reg(2 * ii + 1) = tmp.y; + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Fragment_lds<16> { + template + static inline __device__ void lds(Fragment& f, int ii, uint32_t ptr) { + uint4 tmp; + fmha::lds(tmp, ptr); + f.reg(4 * ii + 0) = tmp.x; + f.reg(4 * ii + 1) = tmp.y; + f.reg(4 * ii + 2) = tmp.z; + f.reg(4 * ii + 3) = tmp.w; + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// template<> +// struct Fragment_lds<32> { +// template< typename Fragment > +// static inline __device__ void lds(Fragment &f, int ii, uint32_t ptr) { +// 
uint4 tmp; +// fmha::lds(tmp, ptr); +// f.reg(8*ii+0) = tmp.x; +// f.reg(8*ii+1) = tmp.y; +// f.reg(8*ii+2) = tmp.z; +// f.reg(8*ii+3) = tmp.w; +// +// fmha::lds(tmp, static_cast(ptr)+sizeof(uint4)); +// f.reg(8*ii+4) = tmp.x; +// f.reg(8*ii+5) = tmp.y; +// f.reg(8*ii+6) = tmp.z; +// f.reg(8*ii+7) = tmp.w; +// } +// }; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Fragment_stg {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Fragment_stg<1> { + template + static inline __device__ void stg(void* ptr, Fragment const& f, int ii = 0) { + fmha::stg(ptr, f.u8(ii)); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Fragment_stg<2> { + template + static inline __device__ void stg(void* ptr, Fragment const& f, int ii = 0) { + fmha::stg(ptr, f.u16(ii)); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Fragment_stg<4> { + template + static inline __device__ void stg(void* ptr, Fragment const& f, int ii = 0) { + fmha::stg(ptr, f.reg(ii)); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Fragment_stg<8> { + template + static inline __device__ void stg(void* ptr, Fragment const& f, int ii = 0) { + uint2 tmp; + tmp.x = f.reg(2 * ii + 0); + tmp.y = f.reg(2 * ii + 1); + fmha::stg(ptr, tmp); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Fragment_stg<16> { + template + static inline __device__ void stg(void* ptr, Fragment const& f, int ii = 0) { + uint4 tmp; + tmp.x = f.reg(4 * ii + 0); + tmp.y = f.reg(4 * ii + 1); + tmp.z = f.reg(4 * ii + 2); + tmp.w = f.reg(4 * ii + 3); + fmha::stg(ptr, tmp); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Fragment_base_ { + // The data type. + using Data_type = Data_type_; + // default input type + using Input_type_ = Data_type_; + + // Does it store the array of elements. + enum { HAS_ELTS = BITS_PER_ELT_ >= 8 }; + + // The number of elements. + enum { NUM_ELTS = NUM_ELTS_ }; + + // The size of element in bits. + enum { BITS_PER_ELT = BITS_PER_ELT_ }; + + // The size of byte of a single register. + enum { BYTES_PER_REG = 4 }; + + // The size in bits. + enum { BITS_PER_REG = BYTES_PER_REG * 8 }; + + // The number of registers needed to store the fragment. + enum { NUM_REGS = Div_up::VALUE }; + + // The size in bytes (as returned by sizeof(Fragment_base<>). + enum { SIZE_IN_BYTES = NUM_REGS * BYTES_PER_REG }; + + // The alignment. + enum { ALIGNMENT = ALIGNMENT_ > 0 ? ALIGNMENT_ : Min::VALUE }; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + // The type of the elements. + typename Data_type_, + // The number of elements. + int NUM_ELTS_, + // The size of each element in bits. + int BITS_PER_ELT_, + // The alignment if you want to force a value -- use 0 otherwise. + int ALIGNMENT_, + // The base class. + typename Base_ = Fragment_base_> +struct alignas(static_cast(Base_::ALIGNMENT)) Fragment_base : public Base_ { + // The size of a load/store. 
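  // (The whole fragment moves as one transaction: all NUM_REGS 32-bit
  // registers, i.e. NUM_REGS * 4 bytes per ldg/lds/stg.)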
+ enum { BYTES_PER_LOAD_STORE = Base_::NUM_REGS * sizeof(uint32_t) }; + + // Clear the fragment. Using PTX in that code seems to produce better SASS... + inline __device__ void clear() { +#pragma unroll + for (int ii = 0; ii < Base_::NUM_REGS; ++ii) { + asm volatile("mov.u32 %0, 0; \n" : "=r"(this->reg(ii)) :); + } + } + + // Load from global memory. + inline __device__ void ldg(void const* ptr) { + Fragment_ldg::ldg(*this, 0, ptr); + } + + // Load from shared memory. + inline __device__ void lds(uint32_t ptr) { + Fragment_lds::lds(*this, 0, ptr); + } + + // Immutable access to a register. + inline __device__ uint32_t const& reg(int ii) const { return this->regs_[ii]; } + + // Mutable access to a register. + inline __device__ uint32_t& reg(int ii) { return this->regs_[ii]; } + + // Set the fragment with a scalar + inline __device__ void set(uint32_t value) { +#pragma unroll + for (int ii = 0; ii < Base_::NUM_REGS; ++ii) { + this->reg(ii) = value; + } + } + + // Store to global memory. + inline __device__ void stg(void* ptr) const { + Fragment_stg::stg(ptr, *this, 0); + } + + // Immutable access to a byte. + inline __device__ uint8_t u8(int ii) const { + return reinterpret_cast(&this->regs_[0])[ii]; + } + + // Mutable access to a u8. + inline __device__ uint8_t& u8(int ii) { return reinterpret_cast(&this->regs_[0])[ii]; } + + // Immutable access to a half-word.. + inline __device__ uint16_t u16(int ii) const { + return reinterpret_cast(&this->regs_[0])[ii]; + } + + // Mutable access to a half-word. + inline __device__ uint16_t& u16(int ii) { + return reinterpret_cast(&this->regs_[0])[ii]; + } + + // Immutable access to a word. + inline __device__ uint32_t u32(int ii) const { + return reinterpret_cast(&this->regs_[0])[ii]; + } + + // Mutable access to a word. + inline __device__ uint32_t& u32(int ii) { + return reinterpret_cast(&this->regs_[0])[ii]; + } + + // Immutable access to a word. + inline __device__ uint2 u64(int ii) const { + return reinterpret_cast(&this->regs_[0])[ii]; + } + + // Mutable access to a word. + inline __device__ uint2& u64(int ii) { return reinterpret_cast(&this->regs_[0])[ii]; } + + // The storage in registers. + // + // NOTE: Instead of using only an array of uint32_t, we could use a union so we could either + // access the registers or the elements. We found that for: + // + // union { + // uint16_t elts_[4]; uint32_t regs_[2]; + // }; + // + // The compiler does not always produce a final structure of 8B. So, for the moment we are + // going to go only with the regs_ array and use reinterpret_cast<> to access elements (see + // below). It may be worth revisiting that when time permits. + uint32_t regs_[Base_::NUM_REGS]; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Fragment : public Fragment_base { + // Immutable access to the elements. + inline __device__ Data_type_ const& elt(int ii) const { + return reinterpret_cast(&this->regs_[0])[ii]; + } + + // Mutable access to the elements. + inline __device__ Data_type_& elt(int ii) { + return reinterpret_cast(&this->regs_[0])[ii]; + } + + // Immutable access to the elements with a cast. + template + inline __device__ Cast_type const& elt_as(int ii) const { + return reinterpret_cast(&this->regs_[0])[ii]; + } + + // Mutable access to the elements. + template + inline __device__ Cast_type& elt_as(int ii) { + return reinterpret_cast(&this->regs_[0])[ii]; + } + + // Add another fragment. 
+ inline __device__ void add(Fragment const& other) { +#pragma unroll + for (int ii = 0; ii < NUM_ELTS_; ++ii) { + this->elt(ii) += other.elt(ii); + } + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Fragment_a {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Fragment_a : public Fragment {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Fragment_a : public Fragment {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Fragment_a : public Fragment {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Fragment_a : public Fragment {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Fragment_a : public Fragment {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Fragment_a : public Fragment {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Fragment_a : public Fragment {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Fragment_a : public Fragment {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Fragment_a : public Fragment {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Fragment_a : public Fragment {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Fragment_a : public Fragment {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Fragment_a : public Fragment {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Fragment_b {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Fragment_b : public Fragment {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Fragment_b : public Fragment {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Fragment_b : public Fragment {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Fragment_b : public Fragment {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Fragment_b : public Fragment {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Fragment_b : public Fragment {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Fragment_b : public Fragment {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Fragment_b 
: public Fragment {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Fragment_b : public Fragment {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Fragment_b : public Fragment {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Fragment_b : public Fragment {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Fragment_b : public Fragment {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Fragment_b : public Fragment {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Fragment_accumulator {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Fragment_accumulator : public Fragment { + // The traits. + using Traits = Volta_hmma_fp16_traits; + // The base class. + using Base = Fragment; + + // The fragments. + using Fragment_a = Fragment_a; + using Fragment_b = Fragment_b; + + // HMMA. + inline __device__ void mma(Fragment_a const& a, Fragment_b const& b) { + asm volatile( + "mma.sync.aligned.m8n8k4.row.col.f16.f16.f16.f16 \n" + " {%0, %1, %2, %3}, \n" + " {%4, %5}, \n" + " {%6, %7}, \n" + " {%0, %1, %2, %3}; \n" + : "+r"(this->reg(0)), "+r"(this->reg(1)), "+r"(this->reg(2)), "+r"(this->reg(3)) + : "r"(a.reg(0)), "r"(a.reg(1)), "r"(b.reg(0)), "r"(b.reg(1))); + asm volatile( + "mma.sync.aligned.m8n8k4.row.col.f16.f16.f16.f16 \n" + " {%0, %1, %2, %3}, \n" + " {%4, %5}, \n" + " {%6, %7}, \n" + " {%0, %1, %2, %3}; \n" + : "+r"(this->reg(0)), "+r"(this->reg(1)), "+r"(this->reg(2)), "+r"(this->reg(3)) + : "r"(a.reg(2)), "r"(a.reg(3)), "r"(b.reg(2)), "r"(b.reg(3))); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Fragment_accumulator : public Fragment { + // The base class. + using Base = Fragment; + + // The fragments. + using Fragment_a = Fragment_a; + using Fragment_b = Fragment_b; + + // HMMA. + inline __device__ void mma(Fragment_a const& a, Fragment_b const& b) { + // K = 0..3 for threads 0..7 and 16..23 and K = 4..7 for 8..15 and 24..31. + asm volatile( + "mma.sync.aligned.m8n8k4.row.row.f16.f16.f16.f16 \n" + " {%0, %1, %2, %3}, \n" + " {%4, %5}, \n" + " {%6, %7}, \n" + " {%0, %1, %2, %3}; \n" + : "+r"(this->reg(0)), "+r"(this->reg(1)), "+r"(this->reg(2)), "+r"(this->reg(3)) + : "r"(a.reg(0)), "r"(a.reg(1)), "r"(b.reg(0)), "r"(b.reg(1))); + asm volatile( + "mma.sync.aligned.m8n8k4.row.row.f16.f16.f16.f16 \n" + " {%0, %1, %2, %3}, \n" + " {%4, %5}, \n" + " {%6, %7}, \n" + " {%0, %1, %2, %3}; \n" + : "+r"(this->reg(4)), "+r"(this->reg(5)), "+r"(this->reg(6)), "+r"(this->reg(7)) + : "r"(a.reg(0)), "r"(a.reg(1)), "r"(b.reg(2)), "r"(b.reg(3))); + + // K = 8..11 for threads 0..7 and 16..23 and K = 12..15 for 8..15 and 24..31. 
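    // The two MMAs below reuse the second half of A (a.reg(2), a.reg(3)); the
    // first accumulates into reg(0..3) with b.reg(4..5), the second into
    // reg(4..7) with b.reg(6..7).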
+ asm volatile( + "mma.sync.aligned.m8n8k4.row.row.f16.f16.f16.f16 \n" + " {%0, %1, %2, %3}, \n" + " {%4, %5}, \n" + " {%6, %7}, \n" + " {%0, %1, %2, %3}; \n" + : "+r"(this->reg(0)), "+r"(this->reg(1)), "+r"(this->reg(2)), "+r"(this->reg(3)) + : "r"(a.reg(2)), "r"(a.reg(3)), "r"(b.reg(4)), "r"(b.reg(5))); + asm volatile( + "mma.sync.aligned.m8n8k4.row.row.f16.f16.f16.f16 \n" + " {%0, %1, %2, %3}, \n" + " {%4, %5}, \n" + " {%6, %7}, \n" + " {%0, %1, %2, %3}; \n" + : "+r"(this->reg(4)), "+r"(this->reg(5)), "+r"(this->reg(6)), "+r"(this->reg(7)) + : "r"(a.reg(2)), "r"(a.reg(3)), "r"(b.reg(6)), "r"(b.reg(7))); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Fragment_accumulator : public Fragment { + // The base class. + using Base = Fragment; + + // The fragments. + using Fragment_a = Fragment_a; + using Fragment_b = Fragment_b; + + // IMMA. + inline __device__ void mma(Fragment_a const& a, Fragment_b const& b) { +#pragma unroll + for (int i = 0; i < 4; ++i) { + asm volatile( + "mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32 \n" + " {%0, %1}, \n" + " {%2}, \n" + " {%3}, \n" + " {%0, %1}; \n" + : "+r"(this->reg(2 * i + 0)), "+r"(this->reg(2 * i + 1)) + : "r"(a.reg(i / 2)), "r"(b.reg(i % 2))); + } + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Fragment_accumulator : public Fragment { + // Do the HMMA. + template + inline __device__ void mma(Fragment_a const& a, + Fragment_b const& b) { + asm volatile( + "mma.sync.aligned.m16n8k8.row.col.f16.f16.f16.f16 \n" + " {%0, %1}, \n" + " {%2, %3}, \n" + " {%4}, \n" + " {%0, %1}; \n" + : "+r"(reg(0)), "+r"(reg(1)) + : "r"(a.reg(0)), "r"(a.reg(1)), "r"(b.reg(0))); + asm volatile( + "mma.sync.aligned.m16n8k8.row.col.f16.f16.f16.f16 \n" + " {%0, %1}, \n" + " {%2, %3}, \n" + " {%4}, \n" + " {%0, %1}; \n" + : "+r"(reg(2)), "+r"(reg(3)) + : "r"(a.reg(0)), "r"(a.reg(1)), "r"(b.reg(1))); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Fragment_accumulator : public Fragment { + // The base class. + using Base = Fragment; + + // Add two fragments. + template + inline __device__ void add(Other_fragment_ const& other) { + for (int ii = 0; ii < Base::NUM_ELTS; ++ii) { + this->elt(ii) = this->elt(ii) + other.elt(ii); + } + } + + inline __device__ void mul(float const other) { + for (int ii = 0; ii < Base::NUM_ELTS; ++ii) { + this->elt(ii) *= other; + } + } + + // Do the HMMA. + template + inline __device__ void mma(Fragment_a const& a, + Fragment_b const& b) { + asm volatile( + "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32 \n" + " {%0, %1, %2, %3}, \n" + " {%4, %5}, \n" + " {%6}, \n" + " {%0, %1, %2, %3}; \n" + : "+f"(elt(0)), "+f"(elt(1)), "+f"(elt(2)), "+f"(elt(3)) + : "r"(a.reg(0)), "r"(a.reg(1)), "r"(b.reg(0))); + asm volatile( + "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32 \n" + " {%0, %1, %2, %3}, \n" + " {%4, %5}, \n" + " {%6}, \n" + " {%0, %1, %2, %3}; \n" + : "+f"(elt(4)), "+f"(elt(5)), "+f"(elt(6)), "+f"(elt(7)) + : "r"(a.reg(0)), "r"(a.reg(1)), "r"(b.reg(1))); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Fragment_accumulator : public Fragment { + // The base class. + using Base = Fragment; + + // The fragments. + using Fragment_a = Fragment_a; + using Fragment_b = Fragment_b; + + // IMMA. 
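    // Four m8n8k16 s8 MMAs cover the tile: i/2 selects the half of A, i%2 the
    // half of B, and each MMA updates one pair of s32 accumulator registers
    // (reg(2*i), reg(2*i+1)).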
+ inline __device__ void mma(Fragment_a const& a, Fragment_b const& b) { +#pragma unroll + for (int i = 0; i < 4; ++i) { + asm volatile( + "mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32 \n" + " {%0, %1}, \n" + " {%2}, \n" + " {%3}, \n" + " {%0, %1}; \n" + : "+r"(this->reg(2 * i + 0)), "+r"(this->reg(2 * i + 1)) + : "r"(a.reg(i / 2)), "r"(b.reg(i % 2))); + } + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Fragment_accumulator : public Fragment { + // Do the HMMA. + template + inline __device__ void mma(Fragment_a const& a, + Fragment_b const& b) { + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 \n" + " {%0, %1}, \n" + " {%2, %3, %4, %5}, \n" + " {%6, %7}, \n" + " {%0, %1}; \n" + : "+r"(reg(0)), "+r"(reg(1)) + : "r"(a.reg(0)), "r"(a.reg(1)), "r"(a.reg(2)), "r"(a.reg(3)), "r"(b.reg(0)), "r"(b.reg(1))); + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 \n" + " {%0, %1}, \n" + " {%2, %3, %4, %5}, \n" + " {%6, %7}, \n" + " {%0, %1}; \n" + : "+r"(reg(2)), "+r"(reg(3)) + : "r"(a.reg(0)), "r"(a.reg(1)), "r"(a.reg(2)), "r"(a.reg(3)), "r"(b.reg(2)), "r"(b.reg(3))); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// BF16 MMA must accumulate with at least FP32 +template <> +struct Fragment_accumulator : public Fragment { + // Do the HMMA. + template + inline __device__ void mma(Fragment_a const& a, + Fragment_b const& b) { + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 \n" + " {%0, %1}, \n" + " {%2, %3, %4, %5}, \n" + " {%6, %7}, \n" + " {%0, %1}; \n" + : "+r"(reg(0)), "+r"(reg(1)) + : "r"(a.reg(0)), "r"(a.reg(1)), "r"(a.reg(2)), "r"(a.reg(3)), "r"(b.reg(0)), "r"(b.reg(1))); + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 \n" + " {%0, %1}, \n" + " {%2, %3, %4, %5}, \n" + " {%6, %7}, \n" + " {%0, %1}; \n" + : "+r"(reg(2)), "+r"(reg(3)) + : "r"(a.reg(0)), "r"(a.reg(1)), "r"(a.reg(2)), "r"(a.reg(3)), "r"(b.reg(2)), "r"(b.reg(3))); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Fragment_accumulator : public Fragment { + // The base class. + using Base = Fragment; + + // Add two fragments. + template + inline __device__ void add(Other_fragment_ const& other) { + for (int ii = 0; ii < Base::NUM_ELTS; ++ii) { + this->elt(ii) = this->elt(ii) + other.elt(ii); + } + } + + inline __device__ void mul(float const other) { + for (int ii = 0; ii < Base::NUM_ELTS; ++ii) { + this->elt(ii) *= other; + } + } + + // Do the HMMA. 
+ template + inline __device__ void mma(Fragment_a const& a, + Fragment_b const& b) { + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 \n" + " {%0, %1, %2, %3}, \n" + " {%4, %5, %6, %7}, \n" + " {%8, %9}, \n" + " {%0, %1, %2, %3}; \n" + : "+f"(elt(0)), "+f"(elt(1)), "+f"(elt(2)), "+f"(elt(3)) + : "r"(a.reg(0)), "r"(a.reg(1)), "r"(a.reg(2)), "r"(a.reg(3)), "r"(b.reg(0)), "r"(b.reg(1))); + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 \n" + " {%0, %1, %2, %3}, \n" + " {%4, %5, %6, %7}, \n" + " {%8, %9}, \n" + " {%0, %1, %2, %3}; \n" + : "+f"(elt(4)), "+f"(elt(5)), "+f"(elt(6)), "+f"(elt(7)) + : "r"(a.reg(0)), "r"(a.reg(1)), "r"(a.reg(2)), "r"(a.reg(3)), "r"(b.reg(2)), "r"(b.reg(3))); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// BF16 MMA must accumulate with at least FP32 +template <> +struct Fragment_accumulator : public Fragment { + // The base class. + using Base = Fragment; + + // Add two fragments. + template + inline __device__ void add(Other_fragment_ const& other) { + for (int ii = 0; ii < Base::NUM_ELTS; ++ii) { + this->elt(ii) = this->elt(ii) + other.elt(ii); + } + } + + inline __device__ void mul(float const other) { + for (int ii = 0; ii < Base::NUM_ELTS; ++ii) { + this->elt(ii) *= other; + } + } + + // Do the HMMA. + template + inline __device__ void mma(Fragment_a const& a, + Fragment_b const& b) { + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 \n" + " {%0, %1, %2, %3}, \n" + " {%4, %5, %6, %7}, \n" + " {%8, %9}, \n" + " {%0, %1, %2, %3}; \n" + : "+f"(elt(0)), "+f"(elt(1)), "+f"(elt(2)), "+f"(elt(3)) + : "r"(a.reg(0)), "r"(a.reg(1)), "r"(a.reg(2)), "r"(a.reg(3)), "r"(b.reg(0)), "r"(b.reg(1))); + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 \n" + " {%0, %1, %2, %3}, \n" + " {%4, %5, %6, %7}, \n" + " {%8, %9}, \n" + " {%0, %1, %2, %3}; \n" + : "+f"(elt(4)), "+f"(elt(5)), "+f"(elt(6)), "+f"(elt(7)) + : "r"(a.reg(0)), "r"(a.reg(1)), "r"(a.reg(2)), "r"(a.reg(3)), "r"(b.reg(2)), "r"(b.reg(3))); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Fragment_accumulator : public Fragment { + // The base class. + using Base = Fragment; + + // The fragments. + using Fragment_a = Fragment_a; + using Fragment_b = Fragment_b; + + // IMMA. + inline __device__ void mma(Fragment_a const& a, Fragment_b const& b) { +#pragma unroll + for (int i = 0; i < 2; ++i) { + asm volatile( + "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32 \n" + " {%0, %1, %2, %3}, \n" + " {%4, %5, %6, %7}, \n" + " {%8, %9}, \n" + " {%0, %1, %2, %3}; \n" + : "+r"(reg(i * 4 + 0)), "+r"(reg(i * 4 + 1)), "+r"(reg(i * 4 + 2)), "+r"(reg(i * 4 + 3)) + : "r"(a.reg(0)), "r"(a.reg(1)), "r"(a.reg(2)), "r"(a.reg(3)), "r"(b.reg(i * 2)), + "r"(b.reg(i * 2 + 1))); + } + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Fragment_accumulator : public Fragment { + // The base class. + using Base = Fragment; + + // The fragments. + using Fragment_a = Fragment_a; + using Fragment_b = Fragment_b; + + // IMMA. 
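    // The e4m3 MMA below is only available on SM89 (Ada) and newer; the
    // __CUDA_ARCH__ < 890 branch compiles to a trap so a mismatched build
    // fails loudly at runtime instead of producing garbage.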
+ inline __device__ void mma(Fragment_a const& a, Fragment_b const& b) { +#pragma unroll + for (int i = 0; i < 2; ++i) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 890 + asm volatile( + "mma.sync.aligned.m16n8k32.row.col.f32.e4m3.e4m3.f32 \n" + " {%0, %1, %2, %3}, \n" + " {%4, %5, %6, %7}, \n" + " {%8, %9}, \n" + " {%0, %1, %2, %3}; \n" + : "+r"(reg(i * 4 + 0)), "+r"(reg(i * 4 + 1)), "+r"(reg(i * 4 + 2)), "+r"(reg(i * 4 + 3)) + : "r"(a.reg(0)), "r"(a.reg(1)), "r"(a.reg(2)), "r"(a.reg(3)), "r"(b.reg(i * 2)), + "r"(b.reg(i * 2 + 1))); +#else + asm volatile("trap;\n"); +#endif + } + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Fragment_accumulator : public Fragment { + // The base class. + using Base = Fragment; + + // The fragments. + using Fragment_a = Fragment_a; + using Fragment_b = Fragment_b; + + // IMMA. + inline __device__ void mma(Fragment_a const& a, Fragment_b const& b) { +#pragma unroll + for (int i = 0; i < 2; ++i) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 890 + asm volatile( + "mma.sync.aligned.m16n8k32.row.col.f16.e4m3.e4m3.f16 \n" + " {%0, %1}, \n" + " {%2, %3, %4, %5}, \n" + " {%6, %7}, \n" + " {%0, %1}; \n" + : "+r"(reg(i * 2 + 0)), "+r"(reg(i * 2 + 1)) + : "r"(a.reg(0)), "r"(a.reg(1)), "r"(a.reg(2)), "r"(a.reg(3)), "r"(b.reg(i * 2)), + "r"(b.reg(i * 2 + 1))); +#else + asm volatile("trap;\n"); +#endif + } + } +}; + +template +struct Tile_o_normalizer { + // The fragment accumulator. + using Fragment_accu = Fragment_accumulator; + + // The Mma tile. + using Mma_tile = typename Traits::template Mma_tile; + + // The number of MMAs in M/N dimensions. + enum { MMAS_M = Mma_tile::MMAS_M }; + + enum { MMAS_N = Mma_tile::VALID_MMAS_N }; + + // The number of rows per thread. + enum { ROWS_PER_THREAD = 2 * MMAS_M }; + + // The number of registers per thread + enum { REGS_PER_THREAD = 4 }; + + // Warps. + enum { WARPS_M = Cta_tile::WARPS_M }; + + enum { WARPS_N = Cta_tile::WARPS_N }; + + enum { WARPS_K = Cta_tile::WARPS_K }; + + // softmax data bytes + enum { BYTES_PER_ELEMENT = sizeof(float) }; + + // Initialize the attention sinks. + template + inline __device__ Tile_o_normalizer(Params const& params, Block_info const& binfo) + : attention_sink_value_(params.attention_sinks != nullptr ? params.attention_sinks[binfo.bidh] + : -FLT_MAX) {} + + // Update the sum when attention sinks are used. + inline __device__ void update_sum(float const (&max)[ROWS_PER_THREAD], + float (&sum)[ROWS_PER_THREAD]) { +#pragma unroll + for (int i = 0; i < ROWS_PER_THREAD; ++i) { + sum[i] += expf(attention_sink_value_ - max[i]); + } + } + + // Update o. + inline __device__ void update(Fragment_accu (&acc_o)[MMAS_M][MMAS_N], + float (&curr_max)[ROWS_PER_THREAD], + float const (&prev_max)[ROWS_PER_THREAD], + float (&sum)[ROWS_PER_THREAD]) { +#ifdef HALF_ACCUMULATION_FOR_FLASH_ATTENTION // Half accumulation +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { + // Precompute the scaling factors for the 2 rows. + uint32_t alpha[2]; +#pragma unroll + for (int ii = 0; ii < 2; ++ii) { + // The row. + int jj = 2 * mi + ii; + // The multiplier. + curr_max[jj] = fmax(prev_max[jj], curr_max[jj]); + float a = expf(prev_max[jj] - curr_max[jj]); + sum[jj] *= a; + // Convert back to FP16x2. + alpha[ii] = fmha::float2_to_half2(a, a); + } + +#pragma unroll + for (int ni = 0; ni < MMAS_N; ++ni) { +#pragma unroll + for (int ii = 0; ii < REGS_PER_THREAD; ++ii) { + // The accumulators in FP16x2. 
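          // alpha = exp(prev_max - new_max) rescales the previously
          // accumulated output rows to the new running maximum; it is packed
          // as FP16x2 so a single HMUL2 below scales both lanes of each
          // accumulator register.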
+ uint32_t acc_o_pair = acc_o[mi][ni].reg(ii); + + // Apply the scaling. + acc_o_pair = fmha::hmul2(alpha[ii & 1], acc_o_pair); + + // Update the register. + acc_o[mi][ni].reg(ii) = acc_o_pair; + } + } + } +#else // Float accumulation +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { + // Precompute the scaling factors for the 2 rows. + float alpha[2]; +#pragma unroll + for (int ii = 0; ii < 2; ++ii) { + // The row. + int jj = 2 * mi + ii; + // The multiplier. + curr_max[jj] = fmax(prev_max[jj], curr_max[jj]); + alpha[ii] = expf(prev_max[jj] - curr_max[jj]); + sum[jj] *= alpha[ii]; + } + +#pragma unroll + for (int ni = 0; ni < MMAS_N; ++ni) { +#pragma unroll + for (int ii = 0; ii < REGS_PER_THREAD; ++ii) { + // The registers. + float2 acc_o_pair = fmha::half2_to_float2(acc_o[mi][ni].reg(ii)); + + // Do the math in Fp32. + acc_o_pair.x = alpha[ii & 1] * acc_o_pair.x; + acc_o_pair.y = alpha[ii & 1] * acc_o_pair.y; + + // Convert back to Fp16x2. + acc_o[mi][ni].reg(ii) = fmha::float2_to_half2(acc_o_pair); + } + } + } +#endif // defined HALF_ACCUMULATION_FOR_FLASH_ATTENTION + } + + // Update o. + inline __device__ void final_update(Fragment_accu (&acc_o)[MMAS_M][MMAS_N], + float (&sum)[ROWS_PER_THREAD]) { +#ifdef HALF_ACCUMULATION_FOR_FLASH_ATTENTION // Half accumulation +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { + // Precompute the scaling factors for the 2 rows. + uint32_t beta[2]; +#pragma unroll + for (int ii = 0; ii < 2; ++ii) { + // The row. + int jj = 2 * mi + ii; + float b = (sum[jj] == 0.f || sum[jj] != sum[jj]) ? 1.f : 1.f / sum[jj]; + // Convert back to FP16x2. + beta[ii] = fmha::float2_to_half2(b, b); + } + +#pragma unroll + for (int ni = 0; ni < MMAS_N; ++ni) { +#pragma unroll + for (int ii = 0; ii < REGS_PER_THREAD; ++ii) { + // The accumulators in FP16x2. + uint32_t acc_o_pair = acc_o[mi][ni].reg(ii); + + // Apply the scaling. + acc_o_pair = fmha::hmul2(acc_o_pair, beta[ii & 1]); + + // Update the register. + acc_o[mi][ni].reg(ii) = acc_o_pair; + } + } + } +#else // Float accumulation +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { + // Precompute the scaling factors for the 2 rows. + float beta[2]; +#pragma unroll + for (int ii = 0; ii < 2; ++ii) { + // The row. + int jj = 2 * mi + ii; + // The diviser. + beta[ii] = (sum[jj] == 0.f || sum[jj] != sum[jj]) ? 1.f : 1.f / sum[jj]; + } + +#pragma unroll + for (int ni = 0; ni < MMAS_N; ++ni) { +#pragma unroll + for (int ii = 0; ii < REGS_PER_THREAD; ++ii) { + // The registers. + float2 acc_o_pair = fmha::half2_to_float2(acc_o[mi][ni].reg(ii)); + + // Do the math in Fp32. + acc_o_pair.x = acc_o_pair.x * beta[ii & 1]; + acc_o_pair.y = acc_o_pair.y * beta[ii & 1]; + + // Convert back to Fp16x2. + acc_o[mi][ni].reg(ii) = fmha::float2_to_half2(acc_o_pair); + } + } + } +#endif // defined HALF_ACCUMULATION_FOR_FLASH_ATTENTION + } + + // Attention sink value. + float attention_sink_value_; +}; + +template +struct Tile_o_normalizer_fp32 { + // The fragment accumulator. + using Fragment_accu = Fragment_accumulator; + + // The Mma tile. + using Mma_tile = typename Traits::template Mma_tile; + + // The number of MMAs in the M dimension. + enum { MMAS_M = Mma_tile::MMAS_M }; + + // The number of MMAs in the N dimension. + enum { MMAS_N = Mma_tile::VALID_MMAS_N }; + + // The number of rows per thread. + enum { ROWS_PER_THREAD = 2 * MMAS_M }; + + // The number of registers per thread. + enum { REGS_PER_THREAD = 8 }; + + // Warps. 
+ enum { WARPS_M = Cta_tile::WARPS_M }; + + enum { WARPS_N = Cta_tile::WARPS_N }; + + enum { WARPS_K = Cta_tile::WARPS_K }; + + // softmax data bytes + enum { BYTES_PER_ELEMENT = sizeof(float) }; + + // Initialize the attention sinks. + template + inline __device__ Tile_o_normalizer_fp32(Params const& params, Block_info const& binfo) + : attention_sink_value_(params.attention_sinks != nullptr ? params.attention_sinks[binfo.bidh] + : -FLT_MAX) {} + + // Update the sum when attention sinks are used. + inline __device__ void update_sum(float const (&max)[ROWS_PER_THREAD], + float (&sum)[ROWS_PER_THREAD]) { +#pragma unroll + for (int i = 0; i < ROWS_PER_THREAD; ++i) { + sum[i] += expf(attention_sink_value_ - max[i]); + } + } + + // Update o. + inline __device__ void update(Fragment_accu (&acc_o)[MMAS_M][MMAS_N], + float (&curr_max)[ROWS_PER_THREAD], + float const (&prev_max)[ROWS_PER_THREAD], + float (&sum)[ROWS_PER_THREAD]) { +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { + // Precompute the scaling factors for the 2 rows. + float alpha[2]; +#pragma unroll + for (int ii = 0; ii < 2; ++ii) { + // The row. + int jj = 2 * mi + ii; + // The multiplier. + curr_max[jj] = fmax(prev_max[jj], curr_max[jj]); + alpha[ii] = expf(prev_max[jj] - curr_max[jj]); + sum[jj] *= alpha[ii]; + } + +#pragma unroll + for (int ni = 0; ni < MMAS_N; ++ni) { +#pragma unroll + for (int ii = 0; ii < REGS_PER_THREAD; ++ii) { + // The register for O. + float acc_o_f = acc_o[mi][ni].elt(ii); + // Compute the next accumulator. + acc_o_f = alpha[(ii & 2) / 2] * acc_o_f; + // Update the accumulator. + acc_o[mi][ni].elt(ii) = acc_o_f; + } + } + } + } + + // Update o after P * V + inline __device__ void final_update(Fragment_accu (&acc_o)[MMAS_M][MMAS_N], + float (&sum)[ROWS_PER_THREAD]) { +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { + // Precompute the scaling factors for the 2 rows. + float beta[2]; +#pragma unroll + for (int ii = 0; ii < 2; ++ii) { + // The row. + int jj = 2 * mi + ii; + + // The diviser. + beta[ii] = (sum[jj] == 0.f || sum[jj] != sum[jj]) ? 1.f : 1.f / sum[jj]; + } + +#pragma unroll + for (int ni = 0; ni < MMAS_N; ++ni) { +#pragma unroll + for (int ii = 0; ii < REGS_PER_THREAD; ++ii) { + // The register for O. + float acc_o_f = acc_o[mi][ni].elt(ii); + // Compute the next accumulator. + acc_o_f = acc_o_f * beta[(ii & 2) / 2]; + // Update the accumulator. + acc_o[mi][ni].elt(ii) = acc_o_f; + } + } + } + } + + // Attention sink value. + float attention_sink_value_; +}; + +template +struct Tile_o_normalizer + : public Tile_o_normalizer_fp32 { + // The traits. + using Traits = fmha::Ampere_hmma_fp32_traits; + // The base class. + using Base = Tile_o_normalizer_fp32; + + // The ctor. + template + inline __device__ Tile_o_normalizer(Params const& params, Block_info const& binfo) + : Base(params, binfo) {} +}; + +template +struct Tile_o_normalizer + : public Tile_o_normalizer_fp32 { + // The traits. + using Traits = fmha::Ampere_hmma_bf16_traits; + // The base class. + using Base = Tile_o_normalizer_fp32; + + // The ctor. + template + inline __device__ Tile_o_normalizer(Params const& params, Block_info const& binfo) + : Base(params, binfo) {} +}; + +// The attention sinks are not enabled for Volta. +template +struct Tile_o_normalizer { + // The traits. + using Traits = Volta_hmma_fp16_16x16x16_traits; + + // The fragments. + using Fragment_accu = Fragment_accumulator; + + // The Mma tile. + using Mma_tile = typename Traits::template Mma_tile; + + // The number of MMAs in M/N dimensions. 
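// -----------------------------------------------------------------------------
// Illustrative aside, not part of this diff: a scalar, host-side sketch of the
// online-softmax bookkeeping that the Tile_o_normalizer variants above apply to
// packed MMA accumulators. The names (OnlineSoftmaxRow, merge_tile, finalize)
// are invented for illustration only.
// -----------------------------------------------------------------------------
#include <cmath>
#include <vector>

struct OnlineSoftmaxRow {
  float row_max = -HUGE_VALF;  // running maximum of the scores seen so far
  float row_sum = 0.f;         // running sum of exp(score - row_max)
  std::vector<float> acc;      // running, un-normalized output row

  explicit OnlineSoftmaxRow(size_t head_dim) : acc(head_dim, 0.f) {}

  // Fold in one K/V tile: `scores` are this row's raw (pre-softmax) logits
  // against the tile, `v[k]` is the k-th value row of the tile.
  void merge_tile(const std::vector<float>& scores,
                  const std::vector<std::vector<float>>& v) {
    float tile_max = -HUGE_VALF;
    for (float s : scores) tile_max = std::fmax(tile_max, s);
    float new_max = std::fmax(row_max, tile_max);

    // Rescale the previous partial state to the new maximum (the `alpha`
    // factor in Tile_o_normalizer::update).
    float alpha = std::exp(row_max - new_max);
    row_sum *= alpha;
    for (float& o : acc) o *= alpha;

    // Accumulate the new tile relative to the new maximum.
    for (size_t k = 0; k < scores.size(); ++k) {
      float p = std::exp(scores[k] - new_max);
      row_sum += p;
      for (size_t d = 0; d < acc.size(); ++d) acc[d] += p * v[k][d];
    }
    row_max = new_max;
  }

  // Final normalization (the `beta` factor in final_update), guarding rows
  // whose sum is zero or NaN (e.g. fully masked rows).
  void finalize() {
    float beta = (row_sum == 0.f || row_sum != row_sum) ? 1.f : 1.f / row_sum;
    for (float& o : acc) o *= beta;
  }
};
// -----------------------------------------------------------------------------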
+ enum { MMAS_M = Mma_tile::MMAS_M }; + + enum { MMAS_N = Mma_tile::VALID_MMAS_N }; + + // The number of rows per thread. + enum { ROWS_PER_THREAD = MMAS_M }; + + // The number of registers per thread + enum { REGS_PER_THREAD = 8 }; + + // Update o. + inline __device__ void update(Fragment_accu (&acc_o)[MMAS_M][MMAS_N], + float (&curr_max)[ROWS_PER_THREAD], + float const (&prev_max)[ROWS_PER_THREAD], + float (&sum)[ROWS_PER_THREAD]) { +#ifdef HALF_ACCUMULATION_FOR_FLASH_ATTENTION // Half accumulation + +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { + // Precompute the scaling factors. + uint32_t alpha; + // Update the curr_max. + curr_max[mi] = fmax(prev_max[mi], curr_max[mi]); + // The multiplier. + float a = expf(prev_max[mi] - curr_max[mi]); + // The accumulated sum. + sum[mi] *= a; + // Convert back to FP16. + alpha = fmha::float2_to_half2(a, a); + +#pragma unroll + for (int ni = 0; ni < MMAS_N; ++ni) { +#pragma unroll + for (int ii = 0; ii < REGS_PER_THREAD; ++ii) { + // The accumulators packed in FP16x2. + uint32_t acc_o_pair = acc_o[mi][ni].reg(ii); + + // Apply the scaling. + acc_o_pair = fmha::hmul2(acc_o_pair, alpha); + + // Update the register. + acc_o[mi][ni].reg(ii) = acc_o_pair; + } + } + } +#else // Float accumulation +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { + // Update the curr_max. + curr_max[mi] = fmax(prev_max[mi], curr_max[mi]); + // The multiplier. + float alpha = expf(prev_max[mi] - curr_max[mi]); + // The accumulated sum. + sum[mi] *= alpha; + +#pragma unroll + for (int ni = 0; ni < MMAS_N; ++ni) { +#pragma unroll + for (int ii = 0; ii < REGS_PER_THREAD; ++ii) { + // The accumulators. Convert from FP16x2 to FP32x2. + float2 acc_o_pair = fmha::half2_to_float2(acc_o[mi][ni].reg(ii)); + + // Apply the scaling. + acc_o_pair.x = alpha * acc_o_pair.x; + acc_o_pair.y = alpha * acc_o_pair.y; + + // Update the register after converting back to FP16x2. + acc_o[mi][ni].reg(ii) = fmha::float2_to_half2(acc_o_pair); + } + } + } +#endif // defined HALF_ACCUMULATION_FOR_FLASH_ATTENTION + } + + // Update o. + inline __device__ void final_update(Fragment_accu (&acc_o)[MMAS_M][MMAS_N], + float const (&sum)[ROWS_PER_THREAD]) { +#ifdef HALF_ACCUMULATION_FOR_FLASH_ATTENTION // Half accumulation +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { + // Precompute the scaling factors. + uint32_t beta; + // The divisor. + float b = (sum[mi] == 0.f || sum[mi] != sum[mi]) ? 1.f : 1.f / sum[mi]; + // Convert back to FP16. + beta = fmha::float2_to_half2(b, b); + +#pragma unroll + for (int ni = 0; ni < MMAS_N; ++ni) { +#pragma unroll + for (int ii = 0; ii < REGS_PER_THREAD; ++ii) { + // The accumulators packed in FP16x2. + uint32_t acc_o_pair = acc_o[mi][ni].reg(ii); + + // Apply the scaling. + acc_o_pair = fmha::hmul2(acc_o_pair, beta); + + // Update the register. + acc_o[mi][ni].reg(ii) = acc_o_pair; + } + } + } +#else // Float accumulation +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { + // The divisor. + float beta = (sum[mi] == 0.f || sum[mi] != sum[mi]) ? 1.f : 1.f / sum[mi]; + +#pragma unroll + for (int ni = 0; ni < MMAS_N; ++ni) { +#pragma unroll + for (int ii = 0; ii < REGS_PER_THREAD; ++ii) { + // The registers. + float2 acc_o_pair = fmha::half2_to_float2(acc_o[mi][ni].reg(ii)); + + // Do the math in Fp32. + acc_o_pair.x = acc_o_pair.x * beta; + acc_o_pair.y = acc_o_pair.y * beta; + + // Convert back to Fp16x2. 
+ acc_o[mi][ni].reg(ii) = fmha::float2_to_half2(acc_o_pair); + } + } + } +#endif // defined HALF_ACCUMULATION_FOR_FLASH_ATTENTION + } +}; + +template +struct Tile_o_normalizer + : public Tile_o_normalizer_fp32 { + // The traits. + using Traits = fmha::Ada_qmma_e4m3_fp32_traits; + // The base class. + using Base = Tile_o_normalizer_fp32; + + // The ctor. + template + inline __device__ Tile_o_normalizer(Params const& params, Block_info const& binfo) + : Base(params, binfo) {} + + // Update the sum. + inline __device__ void update_sum(float const (&max)[Base::ROWS_PER_THREAD], + float (&sum)[Base::ROWS_PER_THREAD]) { +// Take the log2f(Traits::SOFTMAX_FP_QUANT_SCALE) into account as the same scale has been applied to +// sum. +#pragma unroll + for (int i = 0; i < Base::ROWS_PER_THREAD; ++i) { + sum[i] += expf(this->attention_sink_value_ - max[i]) * Traits::SOFTMAX_FP_QUANT_SCALE; + } + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Tile_o_normalizer + : public Tile_o_normalizer_fp32 { + // The traits. + using Traits = fmha::Ada_qmma_e4m3_fp32_traits; + // The base class. + using Base = Tile_o_normalizer_fp32; + + using Fragment_accu = Fragment_accumulator; + + // The Mma tile. + using Mma_tile = typename Traits::template Mma_tile; + + // The number of MMAs in the M dimension. + enum { MMAS_M = Mma_tile::MMAS_M }; + + // The number of MMAs in the N dimension. + enum { MMAS_N = Mma_tile::VALID_MMAS_N }; + + // The number of registers per thread. + enum { REGS_PER_THREAD = 8 }; + + // The ctor. + template + inline __device__ Tile_o_normalizer(Params const& params, Block_info const& binfo) + : Base(params, binfo) {} + + inline __device__ void merge(Fragment_accu (&acc_dst)[MMAS_M][MMAS_N], + Fragment_accu (&acc_src)[MMAS_M][MMAS_N]) { +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { +#pragma unroll + for (int ni = 0; ni < MMAS_N; ++ni) { +#pragma unroll + for (int ii = 0; ii < REGS_PER_THREAD; ++ii) { + acc_dst[mi][ni].elt(ii) += acc_src[mi][ni].elt(ii); + } + } + } + } + + template + inline __device__ void move_to_first_block(Params const& params, int bidb, int bidh) { + int scale_iter = bidb * params.h * params.sage.v.max_nblock + bidh * params.sage.v.max_nblock; + + params_scale_v_iter = reinterpret_cast(params.sage.v.scales + scale_iter); + params_scale_v_ = __ldg(params_scale_v_iter); + } + + inline __device__ void move_to_next_block() { + params_scale_v_iter += 1; + params_scale_v_ = __ldg(params_scale_v_iter); + } + + inline __device__ void apply_scale(Fragment_accu (&acc_o)[MMAS_M][MMAS_N]) { + float const scale = reinterpret_cast(params_scale_v_); + +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { +#pragma unroll + for (int ni = 0; ni < MMAS_N; ++ni) { +#pragma unroll + for (int ii = 0; ii < REGS_PER_THREAD; ++ii) { + float acc_o_f = acc_o[mi][ni].elt(ii); + acc_o_f = scale * acc_o_f; + acc_o[mi][ni].elt(ii) = acc_o_f; + } + } + } + } + + float const* params_scale_v_iter; + float params_scale_v_; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Softmax_saver { + // The number of MMAs in M/N dimensions. + enum { MMAS_M = Mma_tile::MMAS_M }; + + enum { MMAS_N = Mma_tile::VALID_MMAS_N }; + + // The number of rows per thread. + enum { ROWS_PER_THREAD = 2 * MMAS_M }; + + // The number of registers per thread + enum { REGS_PER_THREAD = 4 }; + + // Warps. 
+ enum { WARPS_M = Cta_tile::WARPS_M }; + + enum { WARPS_N = Cta_tile::WARPS_N }; + + enum { WARPS_K = Cta_tile::WARPS_K }; + + // softmax data bytes + enum { BYTES_PER_ELEMENT = sizeof(float) }; + + // Ctor. + template + inline __device__ Softmax_saver(Params const& params, Block_info const& binfo) + : actual_q_len_(binfo.actual_q_seqlen), + softmax_sum_ptr_(reinterpret_cast(params.softmax_stats_ptr)), + softmax_stats_stride_in_bytes_(params.softmax_stats_stride_in_bytes) { + softmax_max_ptr_ = reinterpret_cast(params.softmax_stats_ptr); + + int warp = threadIdx.x / Cta_tile::THREADS_PER_WARP; + int lane = threadIdx.x % Cta_tile::THREADS_PER_WARP; + // MMA row0 index (8x4 thread layout) + + int m_per_mma = 32 / Mma_tile::THREADS_PER_MMA_N * 2; + row0_ = (warp % WARPS_M) * m_per_mma + (lane / 4); + // Decide whether to store the lse values + store_softmax_ = (lane % 4 == 0 && int(warp / WARPS_M) == 0); + + // assume fixed seq length for the batch + size_t const bh_offset = (binfo.sum_s * params.h + binfo.bidh) * sizeof(float) * 2; + softmax_max_ptr_ += bh_offset + row0_ * params.softmax_stats_stride_in_bytes; + softmax_sum_ptr_ += bh_offset + row0_ * params.softmax_stats_stride_in_bytes + sizeof(float); + }; + + inline __device__ void store(int q_loop, float* p_sum, float* p_max) { + if (store_softmax_) { +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { + float sum0 = p_sum[mi * 2]; + float sum1 = p_sum[mi * 2 + 1]; + float max0 = p_max[mi * 2]; + float max1 = p_max[mi * 2 + 1]; + + int row_offset = q_loop * Cta_tile::M + mi * Mma_tile::M_PER_MMA_PER_CTA; + if (row0_ + row_offset < actual_q_len_) { + fmha::stg(softmax_max_ptr_ + row_offset * softmax_stats_stride_in_bytes_, max0); + fmha::stg(softmax_sum_ptr_ + row_offset * softmax_stats_stride_in_bytes_, sum0); + } + if (row0_ + row_offset + 8 < actual_q_len_) { + fmha::stg(softmax_max_ptr_ + (row_offset + 8) * softmax_stats_stride_in_bytes_, max1); + fmha::stg(softmax_sum_ptr_ + (row_offset + 8) * softmax_stats_stride_in_bytes_, sum1); + } + } + } + } + + // ptr (total_token_q, h, 2) float + char* softmax_sum_ptr_ = nullptr; + char* softmax_max_ptr_ = nullptr; + + // the first row's idx + int row0_; + // actual seq length + int const actual_q_len_ = 0; + int const softmax_stats_stride_in_bytes_ = 0; + + // store lse or not + bool store_softmax_ = false; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// Flash Attention: default applied to Turing, Ampere fp16 traits + +template +struct Fragment_updater { + // The fragment accumulator. + using Fragment_accu = Fragment_accumulator; + + // The Mma tile. + using Mma_tile = typename Traits::template Mma_tile; + + // The number of MMAs in M/N dimensions. + enum { MMAS_M = Mma_tile::MMAS_M }; + + enum { MMAS_N = Mma_tile::VALID_MMAS_N }; + + // The number of rows per thread. + enum { ROWS_PER_THREAD = 2 * MMAS_M }; + + // The number of registers per thread + enum { REGS_PER_THREAD = 4 }; + + // Warps. + enum { WARPS_M = Cta_tile::WARPS_M }; + + enum { WARPS_N = Cta_tile::WARPS_N }; + + enum { WARPS_K = Cta_tile::WARPS_K }; + + // softmax data bytes + enum { BYTES_PER_ELEMENT = sizeof(float) }; + + // Ctor. 
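+  // Maps the thread to its first output row from its warp/lane indices and points softmax_lse_ptr_ at the [b, h, s] float LSE buffer for that row.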
+ template + inline __device__ Fragment_updater(Params const& params, Block_info const& binfo) + : actual_seqlen_(binfo.actual_seqlen), + softmax_lse_ptr_(reinterpret_cast(params.lse_ptr)) // [b, h, s] + { + int warp = threadIdx.x / Cta_tile::THREADS_PER_WARP; + int lane = threadIdx.x % Cta_tile::THREADS_PER_WARP; + // MMA row0 index (8x4 thread layout) + row0_ = (warp % WARPS_M) * Mma_tile::M_PER_MMA + (lane / 4); + // Decide whether to store the lse values + store_lse_ = (lane % 4 == 0 && int(warp / WARPS_M) == 0); + + // assume fixed seq length for the batch + size_t const bh_offset = + (binfo.bidb * params.h + binfo.bidh) * binfo.actual_seqlen * BYTES_PER_ELEMENT; + softmax_lse_ptr_ += bh_offset + row0_ * BYTES_PER_ELEMENT; + }; + + // init all statistics + inline __device__ Fragment_updater() { +#pragma unroll + for (int row_i = 0; row_i < ROWS_PER_THREAD; ++row_i) { + curr_max_[row_i] = -HUGE_VALF; + prev_max_[row_i] = -HUGE_VALF; + prev_sum_[row_i] = 0.0f; + curr_sum_[row_i] = 0.0f; + } + } + + // Update o. + inline __device__ void update_o(Fragment_accu (&acc_o)[MMAS_M][MMAS_N], + Fragment_accu const (&local_acc_o)[MMAS_M][MMAS_N]) { +#ifdef HALF_ACCUMULATION_FOR_FLASH_ATTENTION // Half accumulation +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { + // Precompute the scaling factors for the 2 rows. + uint32_t alpha[2], beta[2]; +#pragma unroll + for (int ii = 0; ii < 2; ++ii) { + // The row. + int jj = 2 * mi + ii; + // The multiplier. + float a = prev_sum_[jj] * __expf(prev_max_[jj] - curr_max_[jj]); + // The diviser. + float b = + (curr_sum_[jj] == 0.f || curr_sum_[jj] != curr_sum_[jj]) ? 1.f : 1.f / curr_sum_[jj]; + // Convert back to FP16x2. + alpha[ii] = fmha::float2_to_half2(a, a); + beta[ii] = fmha::float2_to_half2(b, b); + } + +#pragma unroll + for (int ni = 0; ni < MMAS_N; ++ni) { +#pragma unroll + for (int ii = 0; ii < REGS_PER_THREAD; ++ii) { + // The accumulators in FP16x2. + uint32_t local_o_pair = local_acc_o[mi][ni].reg(ii); + uint32_t acc_o_pair = acc_o[mi][ni].reg(ii); + + // Apply the scaling. + acc_o_pair = fmha::hfma2(alpha[ii & 1], acc_o_pair, local_o_pair); + acc_o_pair = fmha::hmul2(acc_o_pair, beta[ii & 1]); + + // Update the register. + acc_o[mi][ni].reg(ii) = acc_o_pair; + } + } + } +#else // Float accumulation +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { + // Precompute the scaling factors for the 2 rows. + float alpha[2], beta[2]; +#pragma unroll + for (int ii = 0; ii < 2; ++ii) { + // The row. + int jj = 2 * mi + ii; + // The multiplier. + alpha[ii] = prev_sum_[jj] * __expf(prev_max_[jj] - curr_max_[jj]); + // The diviser. + beta[ii] = + (curr_sum_[jj] == 0.f || curr_sum_[jj] != curr_sum_[jj]) ? 1.f : 1.f / curr_sum_[jj]; + } + +#pragma unroll + for (int ni = 0; ni < MMAS_N; ++ni) { +#pragma unroll + for (int ii = 0; ii < REGS_PER_THREAD; ++ii) { + // The registers. + float2 local_o_pair = fmha::half2_to_float2(local_acc_o[mi][ni].reg(ii)); + float2 acc_o_pair = fmha::half2_to_float2(acc_o[mi][ni].reg(ii)); + + // Do the math in Fp32. + acc_o_pair.x = (alpha[ii & 1] * acc_o_pair.x + local_o_pair.x) * beta[ii & 1]; + acc_o_pair.y = (alpha[ii & 1] * acc_o_pair.y + local_o_pair.y) * beta[ii & 1]; + + // Convert back to Fp16x2. 
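+          // At this point acc_o holds the running output already divided by the current softmax sum (online-softmax rescaling).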
+ acc_o[mi][ni].reg(ii) = fmha::float2_to_half2(acc_o_pair); + } + } + } +#endif // defined HALF_ACCUMULATION_FOR_FLASH_ATTENTION + } + + // Update max scale + inline __device__ void update_acc_max() { +#pragma unroll + for (int row_i = 0; row_i < ROWS_PER_THREAD; ++row_i) { + float pre_curr_max_ = curr_max_[row_i]; + curr_max_[row_i] = fmaxf(prev_max_[row_i], curr_max_[row_i]); + prev_max_[row_i] = pre_curr_max_; + } + } + + // Update max scale + inline __device__ void update_acc_sum() { +#pragma unroll + for (int row_i = 0; row_i < ROWS_PER_THREAD; ++row_i) { + float pre_curr_sum_ = curr_sum_[row_i]; + curr_sum_[row_i] = + __expf(prev_max_[row_i] - curr_max_[row_i]) * curr_sum_[row_i] + prev_sum_[row_i]; + prev_sum_[row_i] = pre_curr_sum_; + } + } + + inline __device__ void store(int q_loop) { + if (store_lse_) { +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { + float row0_lse = curr_max_[mi * 2] + __logf(curr_sum_[mi * 2]); + float row1_lse = curr_max_[mi * 2 + 1] + __logf(curr_sum_[mi * 2 + 1]); + int row_offset = q_loop * Cta_tile::M + mi * Mma_tile::M_PER_MMA_PER_CTA; + if (row0_ + row_offset < actual_seqlen_) { + fmha::stg(softmax_lse_ptr_ + row_offset * BYTES_PER_ELEMENT, row0_lse); + } + if (row0_ + row_offset + 8 < actual_seqlen_) { + fmha::stg(softmax_lse_ptr_ + (row_offset + 8) * BYTES_PER_ELEMENT, row1_lse); + } + } + } + } + + // Update scales. + float curr_max_[ROWS_PER_THREAD] = {-HUGE_VALF}; + float curr_sum_[ROWS_PER_THREAD] = {0}; + float prev_max_[ROWS_PER_THREAD] = {-HUGE_VALF}; + ; + float prev_sum_[ROWS_PER_THREAD] = {0}; + + // ptr + char* softmax_lse_ptr_ = nullptr; + + // the first row's idx + int row0_ = 0; + // actual seq length + int const actual_seqlen_ = 0; + + // store lse or not + bool store_lse_ = false; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// Flash attention to update the accumulators in the 2nd GEMM when we accumulate in FP32. +// Support both hmma_fp32 and ampere_hmma_bf16 +template +struct Fragment_updater_ampere_fp32 { + // The fragment accumulator. + using Fragment_accu = Fragment_accumulator; + + // The Mma tile. + using Mma_tile = typename Traits::template Mma_tile; + + // The number of MMAs in the M dimension. + enum { MMAS_M = Mma_tile::MMAS_M }; + + // The number of MMAs in the N dimension. + enum { MMAS_N = Mma_tile::VALID_MMAS_N }; + + // The number of rows per thread. + enum { ROWS_PER_THREAD = 2 * MMAS_M }; + + // The number of registers per thread. + enum { REGS_PER_THREAD = 8 }; + + // Warps. + enum { WARPS_M = Cta_tile::WARPS_M }; + + enum { WARPS_N = Cta_tile::WARPS_N }; + + enum { WARPS_K = Cta_tile::WARPS_K }; + + // softmax data bytes + enum { BYTES_PER_ELEMENT = sizeof(float) }; + + // Ctor. 
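+  // Same thread-to-row mapping as the FP16 updater above; statistics are kept for the two rows each thread owns per MMA in M (ROWS_PER_THREAD == 2 * MMAS_M).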
+ template + inline __device__ Fragment_updater_ampere_fp32(Params const& params, Block_info const& binfo) + : actual_seqlen_(binfo.actual_seqlen), + softmax_lse_ptr_(reinterpret_cast(params.lse_ptr)) // [b, h, s] + { + int warp = threadIdx.x / Cta_tile::THREADS_PER_WARP; + int lane = threadIdx.x % Cta_tile::THREADS_PER_WARP; + // MMA row0 index (8x4 thread layout) + row0_ = (warp % WARPS_M) * Mma_tile::M_PER_MMA + (lane / 4); + // Decide whether to store the lse values + store_lse_ = (lane % 4 == 0 && int(warp / WARPS_M) == 0); + + // assume fixed seq length for the batch + size_t const bh_offset = + (binfo.bidb * params.h + binfo.bidh) * binfo.actual_seqlen * BYTES_PER_ELEMENT; + softmax_lse_ptr_ += bh_offset + row0_ * BYTES_PER_ELEMENT; + }; + + // init all statistics + inline __device__ Fragment_updater_ampere_fp32() { +#pragma unroll + for (int row_i = 0; row_i < ROWS_PER_THREAD; ++row_i) { + curr_max_[row_i] = -HUGE_VALF; + prev_max_[row_i] = -HUGE_VALF; + prev_sum_[row_i] = 0.0f; + curr_sum_[row_i] = 0.0f; + } + } + + // Update o after P * V + inline __device__ void update_o(Fragment_accu (&acc_o)[MMAS_M][MMAS_N], + Fragment_accu const (&local_acc_o)[MMAS_M][MMAS_N]) { +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { + // Precompute the scaling factors for the 2 rows. + float alpha[2], beta[2]; +#pragma unroll + for (int ii = 0; ii < 2; ++ii) { + // The row. + int jj = 2 * mi + ii; + // The multiplier. + alpha[ii] = prev_sum_[jj] * __expf(prev_max_[jj] - curr_max_[jj]); + // The diviser. + beta[ii] = + (curr_sum_[jj] == 0.f || curr_sum_[jj] != curr_sum_[jj]) ? 1.f : 1.f / curr_sum_[jj]; + } + +#pragma unroll + for (int ni = 0; ni < MMAS_N; ++ni) { +#pragma unroll + for (int ii = 0; ii < REGS_PER_THREAD; ++ii) { + // The register from P. + float local_acc_o_f = local_acc_o[mi][ni].elt(ii); + // The register for O. + float acc_o_f = acc_o[mi][ni].elt(ii); + // Compute the next accumulator. + acc_o_f = (alpha[(ii & 2) / 2] * acc_o_f + local_acc_o_f) * beta[(ii & 2) / 2]; + // Update the accumulator. + acc_o[mi][ni].elt(ii) = acc_o_f; + } + } + } + } + + // Update o before P * V + inline __device__ void update_o(Fragment_accu (&acc_o)[MMAS_M][MMAS_N]) { +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { + // Precompute the scaling factors for the 2 rows. + float alpha[2], beta[2]; +#pragma unroll + for (int ii = 0; ii < 2; ++ii) { + // The row. + int jj = 2 * mi + ii; + // The multiplier. + alpha[ii] = prev_sum_[jj] * __expf(prev_max_[jj] - curr_max_[jj]); + // The diviser. + beta[ii] = + (curr_sum_[jj] == 0.f || curr_sum_[jj] != curr_sum_[jj]) ? 1.f : 1.f / curr_sum_[jj]; + } + +#pragma unroll + for (int ni = 0; ni < MMAS_N; ++ni) { +#pragma unroll + for (int ii = 0; ii < REGS_PER_THREAD; ++ii) { + // The register for O. + float acc_o_f = acc_o[mi][ni].elt(ii); + // Compute the next accumulator. + acc_o_f = alpha[(ii & 2) / 2] * acc_o_f * beta[(ii & 2) / 2]; + // Update the accumulator. 
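+          // The (ii & 2) / 2 selector above picks which of the two rows of the MMA tile this register belongs to.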
+ acc_o[mi][ni].elt(ii) = acc_o_f; + } + } + } + } + + // Update max scale + inline __device__ void update_acc_max() { +#pragma unroll + for (int ii = 0; ii < ROWS_PER_THREAD; ++ii) { + float curr_max = curr_max_[ii]; + curr_max_[ii] = fmaxf(prev_max_[ii], curr_max); + prev_max_[ii] = curr_max; + } + } + + // Update max scale + inline __device__ void update_acc_sum() { +#pragma unroll + for (int ii = 0; ii < ROWS_PER_THREAD; ++ii) { + float curr_sum = curr_sum_[ii]; + curr_sum_[ii] = __expf(prev_max_[ii] - curr_max_[ii]) * curr_sum_[ii] + prev_sum_[ii]; + prev_sum_[ii] = curr_sum; + } + } + + inline __device__ void store(int q_loop) { + if (store_lse_) { +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { + float row0_lse = curr_max_[mi * 2] + __logf(curr_sum_[mi * 2]); + float row1_lse = curr_max_[mi * 2 + 1] + __logf(curr_sum_[mi * 2 + 1]); + int row_offset = q_loop * Cta_tile::M + mi * Mma_tile::M_PER_MMA_PER_CTA; + if (row0_ + row_offset < actual_seqlen_) { + fmha::stg(softmax_lse_ptr_ + row_offset * BYTES_PER_ELEMENT, row0_lse); + } + if (row0_ + row_offset + 8 < actual_seqlen_) { + fmha::stg(softmax_lse_ptr_ + (row_offset + 8) * BYTES_PER_ELEMENT, row1_lse); + } + } + } + } + + // Update scales. + float curr_max_[ROWS_PER_THREAD] = {-HUGE_VALF}; + float curr_sum_[ROWS_PER_THREAD] = {0}; + float prev_max_[ROWS_PER_THREAD] = {-HUGE_VALF}; + float prev_sum_[ROWS_PER_THREAD] = {0}; + + // ptr + char* softmax_lse_ptr_ = nullptr; + + // the first row's idx + int row0_ = 0; + // actual seq length + int const actual_seqlen_ = 0; + + // store lse or not + bool store_lse_ = false; +}; + +template +struct Fragment_updater + : public Fragment_updater_ampere_fp32 { + // The traits. + using Traits = fmha::Ampere_hmma_fp32_traits; + // The base class. + using Base = Fragment_updater_ampere_fp32; + + // Ctor. + template + inline __device__ Fragment_updater(Params const& params, Block_info const& binfo) + : Base(params, binfo) {} + + // Default ctor + Fragment_updater() = default; +}; + +template +struct Fragment_updater + : public Fragment_updater_ampere_fp32 { + // The traits. + using Traits = fmha::Ampere_hmma_bf16_traits; + // The base class. + using Base = Fragment_updater_ampere_fp32; + + // Ctor. + template + inline __device__ Fragment_updater(Params const& params, Block_info const& binfo) + : Base(params, binfo) {} + + // Default ctor + Fragment_updater() = default; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Fragment_updater + : public Fragment_updater_ampere_fp32 { + // The traits. + using Traits = fmha::Turing_hmma_fp32_traits; + // The base class. + using Base = Fragment_updater_ampere_fp32; + + // Ctor. + template + inline __device__ Fragment_updater(Params const& params, Block_info const& binfo) + : Base(params, binfo) {} + + // Default ctor + Fragment_updater() = default; +}; + +template +struct Fragment_updater + : public Fragment_updater_ampere_fp32 { + // The traits. + using Traits = fmha::Ada_qmma_e4m3_fp32_traits; + // The base class. + using Base = Fragment_updater_ampere_fp32; + + // Ctor. + template + inline __device__ Fragment_updater(Params const& params, Block_info const& binfo) + : Base(params, binfo) {} + + // Default ctor + Fragment_updater() = default; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Fragment_updater { + // The traits. + using Traits = Volta_hmma_fp16_16x16x16_traits; + + // The fragments. 
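+  // Volta HMMA FP16 accumulators stay packed as FP16x2 and each thread owns a single row per MMA, hence ROWS_PER_THREAD == MMAS_M below.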
+ using Fragment_accu = Fragment_accumulator; + + // The Mma tile. + using Mma_tile = typename Traits::template Mma_tile; + + // The number of MMAs in M/N dimensions. + enum { MMAS_M = Mma_tile::MMAS_M }; + + enum { MMAS_N = Mma_tile::VALID_MMAS_N }; + + // The number of rows per thread. + enum { ROWS_PER_THREAD = MMAS_M }; + + // The number of registers per thread + enum { REGS_PER_THREAD = 8 }; + + // init all statistics + inline __device__ Fragment_updater() { +#pragma unroll + for (int row_i = 0; row_i < ROWS_PER_THREAD; ++row_i) { + curr_max_[row_i] = -HUGE_VALF; + prev_max_[row_i] = -HUGE_VALF; + prev_sum_[row_i] = 0.0f; + curr_sum_[row_i] = 0.0f; + } + } + + // Update o. + inline __device__ void update_o(Fragment_accu (&acc_o)[MMAS_M][MMAS_N], + Fragment_accu const (&local_acc_o)[MMAS_M][MMAS_N]) { +#ifdef HALF_ACCUMULATION_FOR_FLASH_ATTENTION // Half accumulation + +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { + // Precompute the scaling factors. + uint32_t alpha, beta; + // The multiplier. + float a = prev_sum_[mi] * __expf(prev_max_[mi] - curr_max_[mi]); + // The diviser. + float b = + (curr_sum_[mi] == 0.f || curr_sum_[mi] != curr_sum_[mi]) ? 1.f : 1.f / curr_sum_[mi]; + // Convert back to FP16. + alpha = fmha::float2_to_half2(a, a); + beta = fmha::float2_to_half2(b, b); + +#pragma unroll + for (int ni = 0; ni < MMAS_N; ++ni) { +#pragma unroll + for (int ii = 0; ii < REGS_PER_THREAD; ++ii) { + // The accumulators packed in FP16x2. + uint32_t local_o_pair = local_acc_o[mi][ni].reg(ii); + uint32_t acc_o_pair = acc_o[mi][ni].reg(ii); + + // Apply the scaling. + acc_o_pair = fmha::hmul2(fmha::hfma2(alpha, acc_o_pair, local_o_pair), beta); + + // Update the register. + acc_o[mi][ni].reg(ii) = acc_o_pair; + } + } + } +#else // Float accumulation +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { + // The multiplier. + float alpha = prev_sum_[mi] * __expf(prev_max_[mi] - curr_max_[mi]); + // The diviser. + float beta = + (curr_sum_[mi] == 0.f || curr_sum_[mi] != curr_sum_[mi]) ? 1.f : 1.f / curr_sum_[mi]; + +#pragma unroll + for (int ni = 0; ni < MMAS_N; ++ni) { +#pragma unroll + for (int ii = 0; ii < REGS_PER_THREAD; ++ii) { + // The accumulators. Convert from FP16x2 to FP32x2. + float2 local_o_pair = fmha::half2_to_float2(local_acc_o[mi][ni].reg(ii)); + float2 acc_o_pair = fmha::half2_to_float2(acc_o[mi][ni].reg(ii)); + + // Apply the scaling. + acc_o_pair.x = (alpha * acc_o_pair.x + local_o_pair.x) * beta; + acc_o_pair.y = (alpha * acc_o_pair.y + local_o_pair.y) * beta; + + // Update the register after converting back to FP16x2. 
+ acc_o[mi][ni].reg(ii) = fmha::float2_to_half2(acc_o_pair); + } + } + } +#endif // defined HALF_ACCUMULATION_FOR_FLASH_ATTENTION + } + + // Update max scale + inline __device__ void update_acc_max() { +#pragma unroll + for (int row_i = 0; row_i < ROWS_PER_THREAD; ++row_i) { + float pre_curr_max_ = curr_max_[row_i]; + curr_max_[row_i] = fmaxf(prev_max_[row_i], curr_max_[row_i]); + prev_max_[row_i] = pre_curr_max_; + } + } + + // Update max scale + inline __device__ void update_acc_sum() { +#pragma unroll + for (int row_i = 0; row_i < ROWS_PER_THREAD; ++row_i) { + float pre_curr_sum_ = curr_sum_[row_i]; + curr_sum_[row_i] = + __expf(prev_max_[row_i] - curr_max_[row_i]) * curr_sum_[row_i] + prev_sum_[row_i]; + prev_sum_[row_i] = pre_curr_sum_; + } + } + + // updater scales + float curr_max_[ROWS_PER_THREAD] = {-HUGE_VALF}; + float curr_sum_[ROWS_PER_THREAD] = {0}; + float prev_max_[ROWS_PER_THREAD] = {-HUGE_VALF}; + float prev_sum_[ROWS_PER_THREAD] = {0}; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Fragment_from_size_in_bytes { + using Type = Fragment(sizeof(Data_type_))>; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Fragment_from_size_in_bytes { + using Type = Fragment; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ void clear(Fragment (&frag)[M][N]) { +#pragma unroll + for (int mi = 0; mi < M; ++mi) { +#pragma unroll + for (int ni = 0; ni < N; ++ni) { + frag[mi][ni].clear(); + } + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Clear_accumulator {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Clear_accumulator { + template + static inline __device__ void apply(Acc (&acc)[M][N], bool = false) { + fmha::clear(acc); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Clear_accumulator { + template + static inline __device__ void apply(Acc (&acc)[M][N], bool = false) { + fmha::clear(acc); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Clear_accumulator { + template + static inline __device__ void apply(Acc (&acc)[M][N], bool = false) { + fmha::clear(acc); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Clear_accumulator { + template + static inline __device__ void apply(Acc (&acc)[M][N], bool enable_i2f_trick = true) { +#if defined(USE_I2F_EMULATION_TRICK) + if (enable_i2f_trick) { +#pragma unroll + for (int mi = 0; mi < M; ++mi) { +#pragma unroll + for (int ni = 0; ni < N; ++ni) { +#pragma unroll + for (int ii = 0; ii < Acc::NUM_REGS; ++ii) { + acc[mi][ni].reg(ii) = uint32_t(FP32_I2F_MAGIC_NUMBER_HEX) / WARPS_K; + } + } + } + } else +#endif // defined(USE_I2F_EMULATION_TRICK) + { + fmha::clear(acc); + } + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace fmha diff --git a/csrc/fmha_v2/fmha/gemm.h b/csrc/fmha_v2/fmha/gemm.h new file mode 100644 index 0000000000..e1422e4f6e --- /dev/null +++ b/csrc/fmha_v2/fmha/gemm.h @@ -0,0 +1,35 @@ +/* + * 
SPDX-FileCopyrightText: Copyright (c) 2011-2024 NVIDIA CORPORATION & AFFILIATES. All rights + * reserved. SPDX-License-Identifier: NVIDIA TensorRT Source Code License Agreement + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. + */ + +#pragma once + +#include +#include + +namespace fmha { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ void gemm(Acc (&acc)[M][N], A const (&a)[M], B const (&b)[N]) { +#pragma unroll + for (int mi = 0; mi < M; ++mi) { +#pragma unroll + for (int ni = 0; ni < N; ++ni) { + acc[mi][ni].mma(a[mi], b[ni]); + } + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace fmha diff --git a/csrc/fmha_v2/fmha/gmem_tile_o.h b/csrc/fmha_v2/fmha/gmem_tile_o.h new file mode 100644 index 0000000000..c3177dc219 --- /dev/null +++ b/csrc/fmha_v2/fmha/gmem_tile_o.h @@ -0,0 +1,465 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2011-2024 NVIDIA CORPORATION & AFFILIATES. All rights + * reserved. SPDX-License-Identifier: NVIDIA TensorRT Source Code License Agreement + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. + */ + +#pragma once + +#include +#include + +namespace fmha { +namespace v1 { + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// H M M A +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Gmem_tile_o {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Hmma_gmem_tile_o { + // The mma tile. + using Mma_tile = typename Traits::template Mma_tile; + + // The size of each element. + enum { BYTES_PER_ELEMENT = 2 }; + + // The size of a row in bytes. + enum { BYTES_PER_ROW = Cta_tile::N * BYTES_PER_ELEMENT }; + + // The size of each STG. + enum { BYTES_PER_STG = 16 }; + + // The number of threads to store a "row" of the matrix. + enum { THREADS_PER_ROW = BYTES_PER_ROW / BYTES_PER_STG }; + + // The number of "rows" stored per STG. + enum { ROWS_PER_STG = Cta_tile::THREADS_PER_CTA / THREADS_PER_ROW }; + + // The number of "rows" stored per iteration of the loop. + enum { ROWS = Cta_tile::M }; + + // We want at least one output per thread (if possible). + enum { ROWS_PER_LOOP_ = ROWS <= 64 ? ROWS : (int)Min::VALUE }; + + // We also want to have "complete" MMAs. + enum { ROWS_PER_LOOP = Max::VALUE }; + + // The number of outer loop for the stores. + enum { LOOPS = fmha::Div_up::VALUE }; + + // DEBUG. + static_assert(ROWS % ROWS_PER_LOOP == 0, ""); + // END OF DEBUG. + + // Make sure the math is correct. 
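+  // Each loop iteration must cover at least one full MMA in the M dimension.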
+ static_assert(ROWS_PER_LOOP >= (int)Mma_tile::M_PER_MMA_PER_CTA, ""); + + // Do we have to guard against partial writes/reads. + enum { HAS_INCOMPLETE_STG = Cta_tile::M % ROWS_PER_STG != 0 }; + + // The number of STGs needed to store a chunk of the Q matrix. + enum { STGS_PER_LOOP = fmha::Div_up::VALUE }; + + // The number of STGs needed to store a chunk of the Q matrix in total. + enum { STGS = STGS_PER_LOOP * LOOPS }; + + // Ctor. + template + inline __device__ Hmma_gmem_tile_o(Params const& params, Block_info const& binfo, int tidx, + int cta_row_offset = 0) + : params_o_stride_in_bytes_(params.o_stride_in_bytes), + o_ptr_(reinterpret_cast(params.o_ptr)) { + // Compute the position in the sequence (within the CTA for the moment). + int row = tidx / THREADS_PER_ROW; + // Compute the position of the thread in the row. + int col = tidx % THREADS_PER_ROW; + + // Is that thread active on the last STG? + if (HAS_INCOMPLETE_STG) { + is_active_for_last_stg_ = row + (STGS - 1) * ROWS_PER_STG < Cta_tile::M; + } + + // Account for the CTA-wide row offset (no loop mode). + row += cta_row_offset; + + // The row offset in the batched GEMM. + int64_t row_offset = (int64_t)row * params.o_stride_in_bytes; + // Take the batch/head offset into account. + row_offset += (int64_t)binfo.bidx * BYTES_PER_ROW; + // Assemble the final pointer. + o_ptr_ += row_offset + col * BYTES_PER_STG; + } + + // Load data from global memory. + inline __device__ void load(uint4 (&dst)[STGS_PER_LOOP], int mi) { + if (blockIdx.x == 0) { +#pragma unroll + for (int ii = 0; ii < STGS_PER_LOOP; ++ii) { + dst[ii] = make_uint4(0u, 0u, 0u, 0u); + } + } else { +#pragma unroll + for (int ii = 0; ii < STGS_PER_LOOP; ++ii) { + int jj = mi * STGS_PER_LOOP + ii; + if (!HAS_INCOMPLETE_STG || (jj < STGS - 1 || is_active_for_last_stg_)) { + fmha::ldg(dst[ii], o_ptr_ + jj * ROWS_PER_STG * params_o_stride_in_bytes_); + } + } + } + } + + // Store data to global memory. + inline __device__ void store(uint4 const (&src)[STGS_PER_LOOP], int mi) { +#pragma unroll + for (int ii = 0; ii < STGS_PER_LOOP; ++ii) { + int jj = mi * STGS_PER_LOOP + ii; + if (!HAS_INCOMPLETE_STG || (jj < STGS - 1 || is_active_for_last_stg_)) { + fmha::stg(o_ptr_ + jj * ROWS_PER_STG * params_o_stride_in_bytes_, src[ii]); + } + } + } + + // Store data to global memory. + inline __device__ void store(uint4 const (&src)[STGS_PER_LOOP], uint4 const (&old)[STGS_PER_LOOP], + int mi) { + uint4 tmp[STGS_PER_LOOP]; +#pragma unroll + for (int ii = 0; ii < STGS_PER_LOOP; ++ii) { + tmp[ii].x = fmha::hadd2(src[ii].x, old[ii].x); + tmp[ii].y = fmha::hadd2(src[ii].y, old[ii].y); + tmp[ii].z = fmha::hadd2(src[ii].z, old[ii].z); + tmp[ii].w = fmha::hadd2(src[ii].w, old[ii].w); + } + this->store(tmp, mi); + } + + // Move the pointer to the next location. + inline __device__ void move() { o_ptr_ += (int64_t)ROWS * params_o_stride_in_bytes_; } + + // The stride between rows for the QKV matrice. + int64_t const params_o_stride_in_bytes_; + // The pointer. + char* o_ptr_; + // Is the thread active for the last STG? + int is_active_for_last_stg_; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Gmem_tile_o + : public Hmma_gmem_tile_o { + // The traits. + using Traits = fmha::Volta_hmma_fp16_16x16x16_traits; + // The base class. + using Base = Hmma_gmem_tile_o; + + // Ctor. 
+ template + inline __device__ Gmem_tile_o(Params const& params, Block_info const& binfo, int tidx, + int cta_row_offset = 0) + : Base(params, binfo, tidx, cta_row_offset) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Gmem_tile_o + : public Hmma_gmem_tile_o { + // The traits. + using Traits = fmha::Turing_hmma_fp16_traits; + // The base class. + using Base = Hmma_gmem_tile_o; + + // Ctor. + template + inline __device__ Gmem_tile_o(Params const& params, Block_info const& binfo, int tidx, + int cta_row_offset = 0) + : Base(params, binfo, tidx, cta_row_offset) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Gmem_tile_o + : public Hmma_gmem_tile_o { + // The traits. + using Traits = fmha::Ampere_hmma_fp16_traits; + // The base class. + using Base = Hmma_gmem_tile_o; + + // Ctor. + template + inline __device__ Gmem_tile_o(Params const& params, Block_info const& binfo, int tidx, + int cta_row_offset = 0) + : Base(params, binfo, tidx, cta_row_offset) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// I M M A +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Imma_gmem_tile_o { + // The mma tile. + using Mma_tile = typename Traits::template Mma_tile; + + // The size of each element. + enum { BYTES_PER_ELEMENT = 1 }; + + // The size of a row in bytes. + enum { BYTES_PER_ROW = Cta_tile::N * BYTES_PER_ELEMENT }; + + // The size of each STG. + enum { BYTES_PER_STG = 4 }; + + // The number of threads to store a "row" of the matrix. + enum { THREADS_PER_ROW = BYTES_PER_ROW / BYTES_PER_STG }; + + // The number of "rows" stored per STG. + enum { ROWS_PER_STG = Cta_tile::THREADS_PER_CTA / THREADS_PER_ROW }; + + // The number of "rows" stored per iteration of the loop. The output of 1 MMA. + enum { ROWS = Cta_tile::M }; + + // We want at least one output per thread (if possible). + enum { ROWS_PER_LOOP_ = ROWS <= 64 ? ROWS : (int)Min::VALUE }; + + // We also want to have "complete" MMAs. + enum { ROWS_PER_LOOP = Max::VALUE }; + + // The number of outer loop for the stores. + enum { LOOPS = fmha::Div_up::VALUE }; + + // DEBUG. + static_assert(ROWS % ROWS_PER_LOOP == 0, ""); + // END OF DEBUG. + + // Make sure the math is correct. + static_assert(ROWS_PER_LOOP >= (int)Mma_tile::M_PER_MMA_PER_CTA, ""); + + // Do we have to guard against partial writes/reads (last STG). + enum { HAS_INCOMPLETE_STG = Cta_tile::M % ROWS_PER_STG != 0 }; + + // The number of STGs needed to store a chunk of the Q matrix. + enum { STGS_PER_LOOP = fmha::Div_up::VALUE }; + + // The number of STGs needed to store a chunk of the Q matrix in total. + enum { STGS = STGS_PER_LOOP * LOOPS }; + + // Are all threads active? + enum { ALL_THREADS_ACTIVE = ROWS_PER_STG <= ROWS_PER_LOOP }; + + // The number of active threads. + enum { ACTIVE_THREADS_ = Cta_tile::THREADS_PER_CTA * ROWS_PER_LOOP / ROWS_PER_STG }; + + // The number of active threads. + enum { ACTIVE_THREADS = ALL_THREADS_ACTIVE ? Cta_tile::THREADS_PER_CTA : ACTIVE_THREADS_ }; + + // Ctor. 
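+  // Besides the usual row/col mapping, the ctor sets up the INT32 scratch pointer used when several CTAs cooperate on one head (CTAS_PER_HEAD > 1).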
+ template + inline __device__ Imma_gmem_tile_o(Params const& params, int bidx, int tidx, int cta_row_offset) + : params_o_stride_in_bytes_(params.o_stride_in_bytes), + params_scale_bmm2_(params.scale_bmm2), + params_enable_i2f_trick_(params.enable_i2f_trick), + o_ptr_(reinterpret_cast(params.o_ptr)) +#if USE_DEMO_BERT_PARAMS + , + o_scratch_ptr_(nullptr) { +#else + , + o_scratch_ptr_(reinterpret_cast(params.o_scratch_ptr)) { +#endif + + // Compute the position in the sequence (within the CTA for the moment). + int row = tidx / THREADS_PER_ROW; + // Compute the position of the thread in the row. + int col = tidx % THREADS_PER_ROW; + + // Is it an active thread? + is_active_ = ALL_THREADS_ACTIVE || row < ROWS_PER_LOOP; + + // Is that thread active on the last STG? + if (HAS_INCOMPLETE_STG) { + is_active_for_last_stg_ = row + (STGS - 1) * ROWS_PER_STG < Cta_tile::M; + } + + // Update the row. + row += cta_row_offset; + + // The row offset in the batched GEMM. + int64_t row_offset = (int64_t)row * params.o_stride_in_bytes; + // Take the batch/head offset into account. + row_offset += (int64_t)bidx * BYTES_PER_ROW; + // Assemble the final pointers. + o_ptr_ += row_offset + col * BYTES_PER_STG; + + // For the scratch space, the pointer has int32 type so it accounts for the *4 factor. + o_scratch_ptr_ += blockIdx.y * STGS_PER_LOOP * ACTIVE_THREADS + tidx; + } + + // Load data from global memory. + inline __device__ void load(uint4 (&dst)[STGS_PER_LOOP], int mi) { + if (blockIdx.x == 0) { +#pragma unroll + for (int ii = 0; ii < STGS_PER_LOOP; ++ii) { + dst[ii] = make_uint4(0u, 0u, 0u, 0u); + } + } else if (ALL_THREADS_ACTIVE || is_active_) { +#pragma unroll + for (int ii = 0; ii < STGS_PER_LOOP; ++ii) { + fmha::ldg(dst[ii], o_scratch_ptr_ + ii * ACTIVE_THREADS); + } + } + } + + // Store data to global memory. + inline __device__ void store(uint4 const (&src)[STGS_PER_LOOP], int mi) { + // The scale. + float const& scale = reinterpret_cast(params_scale_bmm2_); +// Iterate over the different STGs. +#pragma unroll + for (int ii = 0; ii < STGS_PER_LOOP; ++ii) { + // The accumulators are in int32_t. + int4 const& val = reinterpret_cast(src[ii]); + + // Extract the floats and scale. + float f0, f1, f2, f3; +#if defined(USE_I2F_EMULATION_TRICK) + if (params_enable_i2f_trick_) { + f0 = reinterpret_cast(val.x) - FP32_I2F_MAGIC_NUMBER; + f1 = reinterpret_cast(val.y) - FP32_I2F_MAGIC_NUMBER; + f2 = reinterpret_cast(val.z) - FP32_I2F_MAGIC_NUMBER; + f3 = reinterpret_cast(val.w) - FP32_I2F_MAGIC_NUMBER; + } else +#endif // defined(USE_I2F_EMULATION_TRICK) + { + f0 = static_cast(val.x); + f1 = static_cast(val.y); + f2 = static_cast(val.z); + f3 = static_cast(val.w); + } + + // Apply the scaling. + f0 *= scale; + f1 *= scale; + f2 *= scale; + f3 *= scale; + + // Convert the 4 floats to char4. + uint32_t dst = float4_to_char4(f0, f1, f2, f3); + + // Store the result. + int jj = mi * STGS_PER_LOOP + ii; + if (!HAS_INCOMPLETE_STG || (jj < STGS - 1 || is_active_for_last_stg_)) { + fmha::stg(o_ptr_ + jj * ROWS_PER_STG * params_o_stride_in_bytes_, dst); + } + } + } + + // Store data to global memory. + inline __device__ void store(uint4 const (&src)[STGS_PER_LOOP], uint4 const (&old)[STGS_PER_LOOP], + int mi) { + // Do the reduction. 
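+    // Partial results from earlier CTAs are accumulated in INT32; only the last CTA quantizes to INT8 and writes the final output.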
+ uint4 tmp[STGS_PER_LOOP]; +#pragma unroll + for (int ii = 0; ii < STGS_PER_LOOP; ++ii) { + int4 const& src_ii = reinterpret_cast(src[ii]); + int4 const& old_ii = reinterpret_cast(old[ii]); + + int32_t x = src_ii.x + old_ii.x; + int32_t y = src_ii.y + old_ii.y; + int32_t z = src_ii.z + old_ii.z; + int32_t w = src_ii.w + old_ii.w; + + tmp[ii].x = reinterpret_cast(x); + tmp[ii].y = reinterpret_cast(y); + tmp[ii].z = reinterpret_cast(z); + tmp[ii].w = reinterpret_cast(w); + } + + // The last CTA stores INT8 values to the final location. + if (blockIdx.x == CTAS_PER_HEAD - 1) { + this->store(tmp, mi); + + // Other CTAs store INT32 values to the scratch space. + } else if (ALL_THREADS_ACTIVE || is_active_) { +#pragma unroll + for (int ii = 0; ii < STGS_PER_LOOP; ++ii) { + fmha::stg(o_scratch_ptr_ + ii * ACTIVE_THREADS, tmp[ii]); + } + } + } + + // Move the pointer. + inline __device__ void move() { o_ptr_ += (int64_t)ROWS * params_o_stride_in_bytes_; } + + // The stride between rows for the QKV matrice. + int64_t const params_o_stride_in_bytes_; + // The scaling factor to convert to int8. + uint32_t const params_scale_bmm2_; + // Do we enable the i2f trick? + bool const params_enable_i2f_trick_; + // The pointer. + char* o_ptr_; + // The scratch pointer for 32-bit reductions. + int32_t* o_scratch_ptr_; + + // Is it an active thread? When ROWS_PER_STG > ROWS_PER_LOOP, some threads do not store. + int is_active_, is_active_for_last_stg_; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Gmem_tile_o + : public Imma_gmem_tile_o { + // The traits class. + using Traits = fmha::Turing_imma_int8_int32_traits; + // The base class. + using Base = Imma_gmem_tile_o; + + // Ctor. + template + inline __device__ Gmem_tile_o(Params const& params, Block_info const& block_info, int tidx, + int cta_row_offset = 0) + : Base(params, block_info.bidx, tidx, cta_row_offset) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Gmem_tile_o + : public Imma_gmem_tile_o { + // The traits class. + using Traits = fmha::Ampere_imma_int8_int32_traits; + // The base class. + using Base = Imma_gmem_tile_o; + + // Ctor. + template + inline __device__ Gmem_tile_o(Params const& params, Block_info const& block_info, int tidx, + int cta_row_offset = 0) + : Base(params, block_info.bidx, tidx, cta_row_offset) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace v1 +} // namespace fmha diff --git a/csrc/fmha_v2/fmha/gmem_tile_o_packed.h b/csrc/fmha_v2/fmha/gmem_tile_o_packed.h new file mode 100644 index 0000000000..dc13b37f19 --- /dev/null +++ b/csrc/fmha_v2/fmha/gmem_tile_o_packed.h @@ -0,0 +1,1349 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2011-2024 NVIDIA CORPORATION & AFFILIATES. All rights + * reserved. SPDX-License-Identifier: NVIDIA TensorRT Source Code License Agreement + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. 
+ */ + +#pragma once + +#include +#include +#include + +namespace fmha { +namespace v2 { + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// H M M A +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Gmem_tile_o {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Hmma_gmem_tile_o { + // The mma tile. + using Mma_tile = typename Traits::template Mma_tile; + + // The size of each element. + enum { BYTES_PER_ELEMENT = BYTES_PER_ELEMENT_ }; + + // The size of a row in bytes. + enum { BYTES_PER_ROW = Cta_tile::N * BYTES_PER_ELEMENT }; + + // The valid size of a row in bytes. + // Note: cross-attention kernels rely on head dim from runtime instead of from compile-time. + // This approach deviates from self-attention kernels. To explore a unified approach. + // enum { VALID_BYTES_PER_ROW = Cta_tile::VALID_N * BYTES_PER_ELEMENT }; + + // The size of each STG. + enum { BYTES_PER_STG = BYTES_PER_STG_ }; + + // The number of threads to store a "row" of the matrix. + enum { THREADS_PER_ROW = BYTES_PER_ROW / BYTES_PER_STG }; + + // The number of "rows" stored per STG. + enum { ROWS_PER_STG = Cta_tile::THREADS_PER_CTA / THREADS_PER_ROW }; + + // The number of "rows" stored per iteration of the loop. The output of 1 MMA. + enum { ROWS = Cta_tile::M }; + + // We want at least one output per thread (if possible). + enum { ROWS_PER_LOOP_ = ROWS <= 64 ? ROWS : (int)Min::VALUE }; + + // We also want to have "complete" MMAs. + enum { ROWS_PER_LOOP = Max::VALUE }; + + // The number of outer loop for the stores. + enum { LOOPS = fmha::Div_up::VALUE }; + + // DEBUG. + static_assert(ROWS % ROWS_PER_LOOP == 0, ""); + // END OF DEBUG. + + // Make sure the math is correct. + static_assert(ROWS_PER_LOOP >= (int)Mma_tile::M_PER_MMA_PER_CTA, ""); + + // Do we have to guard against partial writes/reads. + enum { HAS_INCOMPLETE_STG = Cta_tile::M % ROWS_PER_STG != 0 }; + + // The number of STGs needed to store a chunk of the Q matrix. + enum { STGS_PER_LOOP = fmha::Div_up::VALUE }; + + // The number of STGs needed to store a chunk of the Q matrix in total. + enum { STGS = STGS_PER_LOOP * LOOPS }; + + // Ctor. + template + inline __device__ Hmma_gmem_tile_o(Params const& params, Block_info const& binfo, int tidx, + int cta_row_offset, int cta_col_offset_in_bytes = 0) + : params_o_stride_in_bytes_(params.o_stride_in_bytes), + actual_seqlen_(binfo.actual_q_seqlen), + o_ptr_(reinterpret_cast(params.o_ptr)) { + // Compute the position in the sequence (within the CTA for the moment). + int row = tidx / THREADS_PER_ROW; + // Compute the position of the thread in the row. + int col = tidx % THREADS_PER_ROW; + + // Is that thread active on the last STG? + if (HAS_INCOMPLETE_STG) { + is_active_for_last_stg_ = row + (STGS - 1) * ROWS_PER_STG < Cta_tile::M; + } + + // Store the row/col to update the predicates in load. + row_ = cta_row_offset + row; + col_in_bytes_ = cta_col_offset_in_bytes + col * BYTES_PER_STG; + init_row_ = row_; + + // The row offset in the batched GEMM. + int64_t row_offset = (int64_t)row_ * params.o_stride_in_bytes; + // The amount of bytes per row without padding. + int const valid_bytes_per_row = params.dv * BYTES_PER_ELEMENT; + // Take the batch/head offset into account. TODO: Fix me! 
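+    // The batch/head offset uses the runtime head size (params.dv) rather than the compile-time VALID_BYTES_PER_ROW, so padding in the head dimension is skipped.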
+ // + // row_offset += binfo.bidx * VALID_BYTES_PER_ROW; + // + row_offset += binfo.bidx * valid_bytes_per_row; + + // Assemble the final pointer. + o_ptr_ += row_offset + col_in_bytes_; + init_o_ptr_ = o_ptr_; + + // Do not store if the thread is in the padded area + active_ = col_in_bytes_ < valid_bytes_per_row; + } + + // Load data from global memory. + inline __device__ void load(uint4 (&dst)[STGS_PER_LOOP], int mi) { + if (blockIdx.x == 0) { +#pragma unroll + for (int ii = 0; ii < STGS_PER_LOOP; ++ii) { + dst[ii] = make_uint4(0u, 0u, 0u, 0u); + } + } else { +#pragma unroll + for (int ii = 0; ii < STGS_PER_LOOP; ++ii) { + int jj = mi * STGS_PER_LOOP + ii; + if (row_ + jj * ROWS_PER_STG >= actual_seqlen_) { + break; + } + if (active_ && (!HAS_INCOMPLETE_STG || (jj < STGS - 1 || is_active_for_last_stg_))) { + fmha::ldg(dst[ii], o_ptr_ + jj * ROWS_PER_STG * params_o_stride_in_bytes_); + } + } + } + } + + // Store data to global memory. + inline __device__ void store(uint4 const (&src)[STGS_PER_LOOP], int mi) { +#pragma unroll + for (int ii = 0; ii < STGS_PER_LOOP; ++ii) { + int jj = mi * STGS_PER_LOOP + ii; + if (row_ + jj * ROWS_PER_STG >= actual_seqlen_) { + break; + } + if (active_ && (!HAS_INCOMPLETE_STG || (jj < STGS - 1 || is_active_for_last_stg_))) { + fmha::stg(o_ptr_ + jj * ROWS_PER_STG * params_o_stride_in_bytes_, src[ii]); + } + } + } + + // Store data to global memory. + inline __device__ void store(uint4 const (&src)[STGS_PER_LOOP], uint4 const (&old)[STGS_PER_LOOP], + int mi) { + uint4 tmp[STGS_PER_LOOP]; +#pragma unroll + for (int ii = 0; ii < STGS_PER_LOOP; ++ii) { + tmp[ii].x = fmha::hadd2(src[ii].x, old[ii].x); + tmp[ii].y = fmha::hadd2(src[ii].y, old[ii].y); + tmp[ii].z = fmha::hadd2(src[ii].z, old[ii].z); + tmp[ii].w = fmha::hadd2(src[ii].w, old[ii].w); + } + this->store(tmp, mi); + } + + // Move the pointer to the next location. + inline __device__ void move(int const steps = 1) { + row_ += ROWS * steps; + o_ptr_ += (int64_t)ROWS * params_o_stride_in_bytes_ * steps; + } + + inline __device__ void move_to(int const step) { + row_ = init_row_ + ROWS * step; + o_ptr_ = init_o_ptr_ + (int64_t)ROWS * params_o_stride_in_bytes_ * step; + } + + // The stride between rows for the QKV matrice. + int64_t params_o_stride_in_bytes_; + // The pointer. + char* o_ptr_; + char* init_o_ptr_; + // Is the thread active for the last STG? + int is_active_for_last_stg_; + + // The row loaded by this thread. + int row_, col_in_bytes_; + int init_row_; + // The length of the sequence loaded by that CTA. + int actual_seqlen_; + // Is that thread active when it comes to loading data? + int active_; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Gmem_tile_o + : public Hmma_gmem_tile_o { + // The traits. + using Traits = fmha::Volta_hmma_fp16_16x16x16_traits; + // The base class. + using Base = Hmma_gmem_tile_o; + + // Ctor. + template + inline __device__ Gmem_tile_o(Params const& params, Block_info const& block_info, int tidx, + int cta_row_offset = 0) + : Base(params, block_info, tidx, cta_row_offset) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Gmem_tile_o + : public Hmma_gmem_tile_o { + // The traits. + using Traits = fmha::Turing_hmma_fp16_traits; + // The base class. + using Base = Hmma_gmem_tile_o; + + // Ctor. 
+ template + inline __device__ Gmem_tile_o(Params const& params, Block_info const& block_info, int tidx, + int cta_row_offset = 0, int cta_col_offset_in_bytes = 0) + : Base(params, block_info, tidx, cta_row_offset, cta_col_offset_in_bytes) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Gmem_tile_o + : public Hmma_gmem_tile_o { + // The traits. + using Traits = fmha::Ampere_hmma_fp16_traits; + // The base class. + using Base = Hmma_gmem_tile_o; + + // Ctor. + template + inline __device__ Gmem_tile_o(Params const& params, Block_info const& block_info, int tidx, + int cta_row_offset = 0, int cta_col_offset_in_bytes = 0) + : Base(params, block_info, tidx, cta_row_offset, cta_col_offset_in_bytes) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Gmem_tile_o + : public Hmma_gmem_tile_o { + // The traits. + using Traits = fmha::Ampere_hmma_bf16_bf16_traits; + // The base class. + using Base = Hmma_gmem_tile_o; + + // Ctor. + template + inline __device__ Gmem_tile_o(Params const& params, Block_info const& block_info, int tidx, + int cta_row_offset = 0, int cta_col_offset_in_bytes = 0) + : Base(params, block_info, tidx, cta_row_offset, cta_col_offset_in_bytes) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Gmem_tile_o + : public Hmma_gmem_tile_o { + // The traits. + using Traits = fmha::Ampere_hmma_fp32_traits; + // The base class. + using Base = Hmma_gmem_tile_o; + + // The epilogue data type + using Epilogue_type = typename Traits::Epilogue_type; + + // DEBUG. + static_assert((Base::THREADS_PER_ROW == 16 || Base::THREADS_PER_ROW == 32 || + Base::THREADS_PER_ROW == 64 || Base::THREADS_PER_ROW == 128) && + Base::BYTES_PER_STG == 8, + ""); + + // END OF DEBUG. + + enum { STGS_PER_LOOP = Base::STGS_PER_LOOP }; + + enum { ROWS_PER_STG = Base::ROWS_PER_STG }; + + enum { STGS = Base::STGS }; + + enum { HAS_INCOMPLETE_STG = Base::HAS_INCOMPLETE_STG }; + + // Ctor. + template + inline __device__ Gmem_tile_o(Params const& params, Block_info const& block_info, int tidx, + int cta_row_offset = 0, int cta_col_offset_in_bytes = 0) + : Base(params, block_info, tidx, cta_row_offset, cta_col_offset_in_bytes) {} + + // Load data from global memory. + inline __device__ void load(uint4 const (&dst)[STGS_PER_LOOP], int mi) { + static_assert(CTAS_PER_HEAD == 1, "Not implemented"); + } + + // Store data to global memory. + inline __device__ void store(uint4 const (&src)[STGS_PER_LOOP], int mi) { +#pragma unroll + for (int ii = 0; ii < STGS_PER_LOOP; ++ii) { + int jj = mi * STGS_PER_LOOP + ii; + if (this->row_ + jj * ROWS_PER_STG >= this->actual_seqlen_) { + break; + } + + float x = reinterpret_cast(src[ii].x); + float y = reinterpret_cast(src[ii].y); + float z = reinterpret_cast(src[ii].z); + float w = reinterpret_cast(src[ii].w); + + uint2 out = float4_to_16bit_x4(x, y, z, w); + if (this->active_ && + (!HAS_INCOMPLETE_STG || (jj < STGS - 1 || this->is_active_for_last_stg_))) { + fmha::stg(this->o_ptr_ + jj * ROWS_PER_STG * this->params_o_stride_in_bytes_, out); + } + } + } + + // Store data to global memory. 
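+  // The multi-CTA reduction path is not implemented for this epilogue; CTAS_PER_HEAD must be 1.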
+ inline __device__ void store(uint4 const (&src)[STGS_PER_LOOP], uint4 const (&old)[STGS_PER_LOOP], + int mi) { + static_assert(CTAS_PER_HEAD == 1, "Not implemented"); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Gmem_tile_o + : public Hmma_gmem_tile_o { + // The traits. + using Traits = fmha::Ampere_hmma_bf16_traits; + // The base class. + using Base = Hmma_gmem_tile_o; + + // The epilogue data type + using Epilogue_type = typename Traits::Epilogue_type; + + // DEBUG. + static_assert((Base::THREADS_PER_ROW == 16 || Base::THREADS_PER_ROW == 32 || + Base::THREADS_PER_ROW == 64 || Base::THREADS_PER_ROW == 128) && + Base::BYTES_PER_STG == 8, + ""); + + // END OF DEBUG. + + enum { STGS_PER_LOOP = Base::STGS_PER_LOOP }; + + enum { ROWS_PER_STG = Base::ROWS_PER_STG }; + + enum { STGS = Base::STGS }; + + enum { HAS_INCOMPLETE_STG = Base::HAS_INCOMPLETE_STG }; + + // Ctor. + template + inline __device__ Gmem_tile_o(Params const& params, Block_info const& block_info, int tidx, + int cta_row_offset = 0, int cta_col_offset_in_bytes = 0) + : Base(params, block_info, tidx, cta_row_offset, cta_col_offset_in_bytes) {} + + // Load data from global memory. + inline __device__ void load(uint4 const (&dst)[STGS_PER_LOOP], int mi) { + static_assert(CTAS_PER_HEAD == 1, "Not implemented"); + } + + // Store data to global memory. + inline __device__ void store(uint4 const (&src)[STGS_PER_LOOP], int mi) { +#pragma unroll + for (int ii = 0; ii < STGS_PER_LOOP; ++ii) { + int jj = mi * STGS_PER_LOOP + ii; + if (this->row_ + jj * ROWS_PER_STG >= this->actual_seqlen_) { + break; + } + + float x = reinterpret_cast(src[ii].x); + float y = reinterpret_cast(src[ii].y); + float z = reinterpret_cast(src[ii].z); + float w = reinterpret_cast(src[ii].w); + + uint2 out = float4_to_16bit_x4(x, y, z, w); + if (this->active_ && + (!HAS_INCOMPLETE_STG || (jj < STGS - 1 || this->is_active_for_last_stg_))) { + fmha::stg(this->o_ptr_ + jj * ROWS_PER_STG * this->params_o_stride_in_bytes_, out); + } + } + } + + // Store data to global memory. + inline __device__ void store(uint4 const (&src)[STGS_PER_LOOP], uint4 const (&old)[STGS_PER_LOOP], + int mi) { + static_assert(CTAS_PER_HEAD == 1, "Not implemented"); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// I M M A +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ uint32_t quantize(int4 const val, float const scale, + bool const params_enable_i2f_trick) { + // Extract the floats and scale. + float f0, f1, f2, f3; +#if defined(USE_I2F_EMULATION_TRICK) + if (params_enable_i2f_trick) { + f0 = reinterpret_cast(val.x) - FP32_I2F_MAGIC_NUMBER; + f1 = reinterpret_cast(val.y) - FP32_I2F_MAGIC_NUMBER; + f2 = reinterpret_cast(val.z) - FP32_I2F_MAGIC_NUMBER; + f3 = reinterpret_cast(val.w) - FP32_I2F_MAGIC_NUMBER; + } else +#endif // defined(USE_I2F_EMULATION_TRICK) + { + f0 = static_cast(val.x); + f1 = static_cast(val.y); + f2 = static_cast(val.z); + f3 = static_cast(val.w); + } + + // Apply the scaling. + f0 *= scale; + f1 *= scale; + f2 *= scale; + f3 *= scale; + + // Convert the 4 floats to char4. 
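+  // float4_to_char4 packs the four scaled floats into a single 32-bit register holding four int8 values.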
+ uint32_t dst = float4_to_char4(f0, f1, f2, f3); + + return dst; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// Helpers to pack 4 registers representing a Src_type into a destination register with 4 8bit +// values representing Dst_type. Scale factor is assumed to be always FP32 for 32-bit accumulators. +template +struct Acc_packer {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// Signed INT32 => INT8. +template <> +struct Acc_packer { + template + static inline __device__ uint32_t run(This const* this_, uint4 const& src_regs) { + float const& scale = reinterpret_cast(this_->params_scale_bmm2_); + // The accumulators are in int32_t. + int4 const& val = reinterpret_cast(src_regs); + + // Quantize... + uint32_t dst = quantize(val, scale, this_->params_enable_i2f_trick_); + return dst; + } +}; + +template <> +struct Acc_packer { + template + static inline __device__ uint32_t run(This const* this_, uint4 const& src_regs) { + // The accumulators are in int32_t. + int4 const& val = reinterpret_cast(src_regs); + + // Quantize... + uint32_t dst = quantize(val, 1.0f, this_->params_enable_i2f_trick_); + return dst; + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// FP32 => FP8. +template <> +struct Acc_packer { + template + static inline __device__ uint32_t run(This const* this_, uint4 const& src_regs) { + float const scale = reinterpret_cast(this_->params_scale_bmm2_); + + float4 const& val = reinterpret_cast(src_regs); + + uint32_t dst = + fmha::float4_to_e4m3x4(val.x * scale, val.y * scale, val.z * scale, val.w * scale); + return dst; + } + + template + static inline __device__ uint16_t run(This const* this_, uint2 const& src_regs) { + float const& scale = reinterpret_cast(this_->params_scale_bmm2_); + + float2 const& val = reinterpret_cast(src_regs); + + uint16_t dst = fmha::float2_to_e4m3x2(val.x * scale, val.y * scale); + return dst; + } +}; + +// FP32 => FP8. +template <> +struct Acc_packer { + template + static inline __device__ uint32_t run(This const* this_, uint4 const& src_regs) { + float4 const& val = reinterpret_cast(src_regs); + + uint32_t dst = fmha::float4_to_e4m3x4(val.x, val.y, val.z, val.w); + return dst; + } + + template + static inline __device__ uint16_t run(This const* this_, uint2 const& src_regs) { + float2 const& val = reinterpret_cast(src_regs); + + uint16_t dst = fmha::float2_to_e4m3x2(val.x, val.y); + return dst; + } +}; + +// FP16 => FP8. +template <> +struct Acc_packer { + template + static inline __device__ uint2 run(This const* this_, uint4 const& src_regs) { + uint2 dst; + dst.x = fmha::half4_to_e4m3x4(fmha::hmul2(src_regs.x, this_->params_scale_bmm2_), + fmha::hmul2(src_regs.y, this_->params_scale_bmm2_)); + dst.y = fmha::half4_to_e4m3x4(fmha::hmul2(src_regs.z, this_->params_scale_bmm2_), + fmha::hmul2(src_regs.w, this_->params_scale_bmm2_)); + + return dst; + } +}; + +// FP16 => FP8. 
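+// Unscaled variant: the FP16 accumulators are packed straight to e4m3 without multiplying by scale_bmm2.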
+template <> +struct Acc_packer { + template + static inline __device__ uint2 run(This const* this_, uint4 const& src_regs) { + uint2 dst; + dst.x = fmha::half4_to_e4m3x4(src_regs.x, src_regs.y); + dst.y = fmha::half4_to_e4m3x4(src_regs.z, src_regs.w); + + return dst; + } +}; + +template <> +struct Acc_packer { + template + static inline __device__ uint32_t run(This const* this_, uint4 const& src_regs) { + float const& scale = reinterpret_cast(this_->params_scale_bmm2_); + + float4 const& val = reinterpret_cast(src_regs); + + uint32_t dst = + fmha::float4_to_e5m2x4(val.x * scale, val.y * scale, val.z * scale, val.w * scale); + return dst; + } +}; + +template <> +struct Acc_packer { + template + static inline __device__ uint32_t run(This const* this_, uint4 const& src_regs) { + float4 const& val = reinterpret_cast(src_regs); + + uint32_t dst = fmha::float4_to_e5m2x4(val.x, val.y, val.z, val.w); + return dst; + } +}; + +template <> +struct Acc_packer { + template + static inline __device__ uint2 run(This const* this_, uint4 const& src_regs) { + float4 const& val = reinterpret_cast(src_regs); + + uint2 dst = fmha::float4_to_half4(val.x, val.y, val.z, val.w); + return dst; + } +}; + +template <> +struct Acc_packer { + template + static inline __device__ uint2 run(This const* this_, uint4 const& src_regs) { + float const& scale = reinterpret_cast(this_->params_scale_bmm2_); + + float4 const& val = reinterpret_cast(src_regs); + + uint2 dst = fmha::float4_to_half4(val.x * scale, val.y * scale, val.z * scale, val.w * scale); + return dst; + } +}; + +template <> +struct Acc_packer { + template + static inline __device__ uint2 run(This const* this_, uint4 const& src_regs) { + float4 const& val = reinterpret_cast(src_regs); + + uint2 dst = fmha::float4_to_16bit_x4(val.x, val.y, val.z, val.w); + return dst; + } +}; + +template <> +struct Acc_packer { + template + static inline __device__ uint2 run(This const* this_, uint4 const& src_regs) { + float const& scale = reinterpret_cast(this_->params_scale_bmm2_); + + float4 const& val = reinterpret_cast(src_regs); + + uint2 dst = fmha::float4_to_16bit_x4(val.x * scale, val.y * scale, val.z * scale, + val.w * scale); + return dst; + } +}; + +// support both 32 bit accumulationi and 16 bit accumulation (imma and qmma) +template +struct Gmem_tile_o_8bit { + // static_assert(sizeof(typename Traits::Accumulator_type) == 4); + static_assert(sizeof(typename Traits::C_type) == 1); + + // The mma tile. + using Mma_tile = typename Traits::template Mma_tile; + + // The size of each element. + enum { BYTES_PER_ELEMENT = 1 }; + + // The size of a row in bytes. + enum { BYTES_PER_ROW = Cta_tile::N * BYTES_PER_ELEMENT }; + + // The valid size of a row in bytes. + enum { VALID_BYTES_PER_ROW = Cta_tile::VALID_N * BYTES_PER_ELEMENT }; + + // The size of each STG (16B --> 8bit elements). + enum { BYTES_PER_STG = fmha::Div_up<16, sizeof(typename Traits::Accumulator_type)>::VALUE }; + + // The STG packed data type + using Stg_packed_type = typename Uint_from_size_in_bytes::Type; + + // The number of threads to store a "row" of the matrix. + enum { THREADS_PER_ROW = BYTES_PER_ROW / BYTES_PER_STG }; + + // The number of "rows" stored per STG. + enum { ROWS_PER_STG = Cta_tile::THREADS_PER_CTA / THREADS_PER_ROW }; + + // The number of "rows" stored per iteration of the loop. The output of 1 MMA. + enum { ROWS = Cta_tile::M }; + + // We want at least one output per thread (if possible). + enum { ROWS_PER_LOOP_ = ROWS <= 64 ? 
ROWS : (int)Min::VALUE }; + + // We also want to have "complete" MMAs. + enum { ROWS_PER_LOOP = Max::VALUE }; + + // The number of outer loop for the stores. + enum { LOOPS = fmha::Div_up::VALUE }; + + // DEBUG. + static_assert(ROWS % ROWS_PER_LOOP == 0, ""); + + // Make sure the math is correct. + static_assert(ROWS_PER_LOOP >= (int)Mma_tile::M_PER_MMA_PER_CTA, ""); + + // Do we have to guard against partial writes/reads. + enum { HAS_INCOMPLETE_STG = Cta_tile::M % ROWS_PER_STG != 0 }; + + // The number of STGs needed to store a chunk of the Q matrix. + enum { STGS_PER_LOOP = fmha::Div_up::VALUE }; + + // The number of STGs needed to store a chunk of the Q matrix in total. + enum { STGS = STGS_PER_LOOP * LOOPS }; + +#if 0 + // The number of "rows" stored per iteration of the loop. The output of 1 MMA. + enum { ROWS = Cta_tile::M }; + // The number of "rows" stored per iteration of the loop. The output of 1 MMA. + enum { ROWS_PER_LOOP = Mma_tile::M_PER_MMA_PER_CTA }; + // The number of outer loop for the stores. + enum { LOOPS = ROWS / ROWS_PER_LOOP }; + + // Make sure the math is correct. + static_assert(LOOPS == (int)Mma_tile::MMAS_M, ""); + + // The number of "rows" stored per STG -- for it to be the number of rows per MMA instruction. + enum { ROWS_PER_STG = Cta_tile::THREADS_PER_CTA / THREADS_PER_ROW }; + // The number of STGs needed to store a chunk of the Q matrix. + enum { STGS_PER_LOOP = fmha::Div_up::VALUE }; +#endif + + // Are all threads active? + enum { ALL_THREADS_ACTIVE = ROWS_PER_STG <= ROWS_PER_LOOP }; + + // The number of active threads. + enum { ACTIVE_THREADS_ = Cta_tile::THREADS_PER_CTA * ROWS_PER_LOOP / ROWS_PER_STG }; + + // The number of active threads. + enum { ACTIVE_THREADS = ALL_THREADS_ACTIVE ? Cta_tile::THREADS_PER_CTA : ACTIVE_THREADS_ }; + + // Ctor. + template + inline __device__ Gmem_tile_o_8bit(Params const& params, Block_info const& block_info, int tidx, + int cta_row_offset = 0, int cta_col_offset_in_bytes = 0) + : params_o_stride_in_bytes_(params.o_stride_in_bytes), + actual_seqlen_(block_info.actual_q_seqlen), + params_scale_bmm2_(params.scale_bmm2_d ? *params.scale_bmm2_d : params.scale_bmm2) +#ifdef GENERATE_CUBIN + , + params_enable_i2f_trick_(false) +#else + , + params_enable_i2f_trick_(params.enable_i2f_trick) +#endif + , + o_ptr_(reinterpret_cast(params.o_ptr)) +#if USE_DEMO_BERT_PARAMS + , + o_scratch_ptr_(nullptr) { +#else + , + o_scratch_ptr_(reinterpret_cast(params.o_scratch_ptr)) { +#endif + + // Compute the position in the sequence (within the CTA for the moment). + int row = tidx / THREADS_PER_ROW; + // Compute the position of the thread in the row. + int col = tidx % THREADS_PER_ROW; + + // Is it an active thread for the very last STG? + if (HAS_INCOMPLETE_STG) { + is_active_for_last_stg_ = row + (STGS - 1) * ROWS_PER_STG < Cta_tile::M; + } + + // Store the row to check against the length before loads. + row_ = cta_row_offset + row; + col_in_bytes_ = cta_col_offset_in_bytes + col * BYTES_PER_STG; + + // The row offset in the batched GEMM. + int64_t row_offset = (int64_t)row_ * params.o_stride_in_bytes; + // The amount of bytes per row without padding (runtime). + int const valid_bytes_per_row = params.dv * BYTES_PER_ELEMENT; + // Take the batch/head offset into account. + row_offset += block_info.bidx * valid_bytes_per_row; + // Assemble the final pointer. + o_ptr_ += row_offset + col_in_bytes_; + + // Is it an active thread? 
+ is_active_ = ALL_THREADS_ACTIVE || (row < ROWS_PER_LOOP && col_in_bytes_ < VALID_BYTES_PER_ROW); + + // Do not store if the thread is in the padded area + is_active_ = is_active_ && col < valid_bytes_per_row / BYTES_PER_STG; + + // For the scratch space, the pointer has int32 type so it accounts for the *4 factor. + o_scratch_ptr_ += blockIdx.y * STGS_PER_LOOP * ACTIVE_THREADS + tidx; + } + + // Load data from global memory. + inline __device__ void load(uint4 (&dst)[STGS_PER_LOOP], int mi) { + if (blockIdx.x == 0) { +#pragma unroll + for (int ii = 0; ii < STGS_PER_LOOP; ++ii) { + dst[ii] = make_uint4(0u, 0u, 0u, 0u); + } + } else if (ALL_THREADS_ACTIVE || is_active_) { +#pragma unroll + for (int ii = 0; ii < STGS_PER_LOOP; ++ii) { + fmha::ldg(dst[ii], o_scratch_ptr_ + ii * ACTIVE_THREADS); + } + } + } + + // Store data to global memory. + inline __device__ void store(uint4 const (&src)[STGS_PER_LOOP], int mi) { +// Iterate over the different STGs. +#pragma unroll + for (int ii = 0; ii < STGS_PER_LOOP; ++ii) { + // Break early if we exceed s_i... + int jj = mi * STGS_PER_LOOP + ii; + if (row_ + jj * ROWS_PER_STG >= actual_seqlen_) { + return; + } + using Src_type = typename Traits::Accumulator_type; + using Dst_type = typename Traits::C_type; + // Packs the 32bit/16bit values to 8bit. + // Depending on the type, applies extra scaling with parameter scale_bmm2. + Stg_packed_type dst = Acc_packer::run(this, src[ii]); + float const* row_ptr = reinterpret_cast(&src[ii]); + + // Store the result. + if (is_active_ && (!HAS_INCOMPLETE_STG || (jj < STGS - 1 || is_active_for_last_stg_))) { + fmha::stg(o_ptr_ + jj * ROWS_PER_STG * params_o_stride_in_bytes_, dst); + } + } + } + + // Store data to global memory. + // TODO: 16bit (half) + inline __device__ void store(uint4 const (&src)[STGS_PER_LOOP], uint4 const (&old)[STGS_PER_LOOP], + int mi) { + // Do the reduction. + uint4 tmp[STGS_PER_LOOP]; +#if defined(USE_I2F_EMULATION_TRICK) + if (params_enable_i2f_trick_) { +#pragma unroll + for (int ii = 0; ii < STGS_PER_LOOP; ++ii) { + float4 const& src_ii = reinterpret_cast(src[ii]); + float4 const& old_ii = reinterpret_cast(old[ii]); + + float x = src_ii.x + old_ii.x; + float y = src_ii.y + old_ii.y; + float z = src_ii.z + old_ii.z; + float w = src_ii.w + old_ii.w; + + tmp[ii].x = reinterpret_cast(x); + tmp[ii].y = reinterpret_cast(y); + tmp[ii].z = reinterpret_cast(z); + tmp[ii].w = reinterpret_cast(w); + } + } else +#endif + { +#pragma unroll + for (int ii = 0; ii < STGS_PER_LOOP; ++ii) { + int4 const& src_ii = reinterpret_cast(src[ii]); + int4 const& old_ii = reinterpret_cast(old[ii]); + + int32_t x = src_ii.x + old_ii.x; + int32_t y = src_ii.y + old_ii.y; + int32_t z = src_ii.z + old_ii.z; + int32_t w = src_ii.w + old_ii.w; + + tmp[ii].x = reinterpret_cast(x); + tmp[ii].y = reinterpret_cast(y); + tmp[ii].z = reinterpret_cast(z); + tmp[ii].w = reinterpret_cast(w); + } + } + + // The last CTA stores INT8 values to the final location. + if (blockIdx.x == CTAS_PER_HEAD - 1) { + this->store(tmp, mi); + + // Other CTAs store INT32 values to the scratch space. + } else if (ALL_THREADS_ACTIVE || is_active_) { +#pragma unroll + for (int ii = 0; ii < STGS_PER_LOOP; ++ii) { + fmha::stg(o_scratch_ptr_ + ii * ACTIVE_THREADS, tmp[ii]); + } + } + } + + // Move the pointer. + inline __device__ void move() { + row_ += ROWS; + o_ptr_ += (int64_t)ROWS * params_o_stride_in_bytes_; + } + + // The stride between rows for the QKV matrice. + int64_t params_o_stride_in_bytes_; + // The scaling factor to convert to int8. 
+ uint32_t const params_scale_bmm2_; + // Do we enable the i2f trick? + bool const params_enable_i2f_trick_; + // The pointer. + char* o_ptr_; + // The pointer to the scratch space to do the reduction (for CTAS_PER_HEAD > 1). + uint4* o_scratch_ptr_; + // The row, col stored by this thread (i.e. the position in that sequence). + int row_, col_in_bytes_; + // The size of the sequence length computed by that CTA. + int actual_seqlen_; + + // Is it an active thread? + int is_active_, is_active_for_last_stg_; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Gmem_tile_o + : public Gmem_tile_o_8bit { + // The traits class. + using Traits = fmha::Volta_imma_int8_int32_traits; + // The base class. + using Base = Gmem_tile_o_8bit; + + // Ctor. + template + inline __device__ Gmem_tile_o(Params const& params, Block_info const& block_info, int tidx, + int cta_row_offset = 0, int cta_col_offset_in_bytes = 0) + : Base(params, block_info, tidx, cta_row_offset, cta_col_offset_in_bytes) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Gmem_tile_o + : public Gmem_tile_o_8bit { + // The traits class. + using Traits = fmha::Turing_imma_int8_int32_traits; + // The base class. + using Base = Gmem_tile_o_8bit; + + // Ctor. + template + inline __device__ Gmem_tile_o(Params const& params, Block_info const& block_info, int tidx, + int cta_row_offset = 0, int cta_col_offset_in_bytes = 0) + : Base(params, block_info, tidx, cta_row_offset, cta_col_offset_in_bytes) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Gmem_tile_o + : public Gmem_tile_o_8bit { + // The traits class. + using Traits = fmha::Ampere_imma_int8_int32_traits; + // The base class. + using Base = Gmem_tile_o_8bit; + + // Ctor. + template + inline __device__ Gmem_tile_o(Params const& params, Block_info const& block_info, int tidx, + int cta_row_offset = 0, int cta_col_offset_in_bytes = 0) + : Base(params, block_info, tidx, cta_row_offset, cta_col_offset_in_bytes) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Gmem_tile_o + : public Gmem_tile_o_8bit { + // The traits class. + using Traits = fmha::Ada_qmma_e4m3_fp32_traits; + // The base class. + using Base = Gmem_tile_o_8bit; + + // Ctor. + template + inline __device__ Gmem_tile_o(Params const& params, Block_info const& block_info, int tidx, + int cta_row_offset = 0, int cta_col_offset_in_bytes = 0) + : Base(params, block_info, tidx, cta_row_offset, cta_col_offset_in_bytes) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Gmem_tile_o + : public Gmem_tile_o_8bit { + // The traits class. + using Traits = fmha::Ada_qmma_e4m3_fp16_traits; + // The base class. + using Base = Gmem_tile_o_8bit; + + // Ctor. + template + inline __device__ Gmem_tile_o(Params const& params, Block_info const& block_info, int tidx, + int cta_row_offset = 0, int cta_col_offset_in_bytes = 0) + : Base(params, block_info, tidx, cta_row_offset, cta_col_offset_in_bytes) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Gmem_tile_o_16bit { + // This stores the fp32 accumulators of Ada_qmma_e4m3_fp32_traits as 16bit values to + // the global memory. 
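+  // Compared to Gmem_tile_o_8bit above, each element is 2 bytes and every 16-byte accumulator
+  // chunk (uint4, i.e. four fp32 values) is packed by Acc_packer into four 16-bit outputs per STG.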
+
+  static_assert(std::is_same::value);
+  static_assert(std::is_same::value ||
+                std::is_same::value);
+
+  using Mma_tile = typename Traits::template Mma_tile<Cta_tile>;
+
+  // The size of each element.
+  enum { BYTES_PER_ELEMENT = 2 };
+
+  // The size of a row in bytes.
+  enum { BYTES_PER_ROW = Cta_tile::N * BYTES_PER_ELEMENT };
+
+  // The valid size of a row in bytes.
+  // Note: cross-attention kernels rely on the head dim known at runtime instead of at compile
+  // time. This deviates from the self-attention kernels; TODO: explore a unified approach.
+  enum { VALID_BYTES_PER_ROW = Cta_tile::VALID_N * BYTES_PER_ELEMENT };
+
+  // The size of each STG.
+  enum { BYTES_PER_STG = 8 };
+
+  // The STG packed data type.
+  using Stg_packed_type = typename Uint_from_size_in_bytes<BYTES_PER_STG>::Type;
+
+  // The number of threads to store a "row" of the matrix.
+  enum { THREADS_PER_ROW = BYTES_PER_ROW / BYTES_PER_STG };
+
+  // The number of "rows" stored per STG.
+  enum { ROWS_PER_STG = Cta_tile::THREADS_PER_CTA / THREADS_PER_ROW };
+
+  // The number of "rows" stored per iteration of the loop. The output of 1 MMA.
+  enum { ROWS = Cta_tile::M };
+
+  // We want at least one output per thread (if possible).
+  enum { ROWS_PER_LOOP_ = ROWS <= 64 ? ROWS : (int)Min::VALUE };
+
+  // We also want to have "complete" MMAs.
+  enum { ROWS_PER_LOOP = Max<ROWS_PER_LOOP_, Mma_tile::M_PER_MMA_PER_CTA>::VALUE };
+
+  // The number of outer loops for the stores.
+  enum { LOOPS = fmha::Div_up<ROWS, ROWS_PER_LOOP>::VALUE };
+
+  // DEBUG.
+  static_assert(ROWS % ROWS_PER_LOOP == 0, "");
+  // END OF DEBUG.
+
+  // Make sure the math is correct.
+  static_assert(ROWS_PER_LOOP >= (int)Mma_tile::M_PER_MMA_PER_CTA, "");
+
+  // Do we have to guard against partial writes/reads?
+  enum { HAS_INCOMPLETE_STG = Cta_tile::M % ROWS_PER_STG != 0 };
+
+  // The number of STGs needed to store a chunk of the O matrix.
+  enum { STGS_PER_LOOP = fmha::Div_up<ROWS_PER_LOOP, ROWS_PER_STG>::VALUE };
+
+  // The number of STGs needed to store a chunk of the O matrix in total.
+  enum { STGS = STGS_PER_LOOP * LOOPS };
+
+  // Are all threads active?
+  enum { ALL_THREADS_ACTIVE = ROWS_PER_STG <= ROWS_PER_LOOP };
+
+  // The number of active threads.
+  enum { ACTIVE_THREADS_ = Cta_tile::THREADS_PER_CTA * ROWS_PER_LOOP / ROWS_PER_STG };
+
+  // The number of active threads.
+  enum { ACTIVE_THREADS = ALL_THREADS_ACTIVE ? Cta_tile::THREADS_PER_CTA : ACTIVE_THREADS_ };
+
+  // Ctor.
+  template <typename Params, typename Block_info>
+  inline __device__ Gmem_tile_o_16bit(Params const& params, Block_info const& block_info, int tidx,
+                                      int cta_row_offset = 0, int cta_col_offset_in_bytes = 0)
+      : params_o_stride_in_bytes_(params.o_stride_in_bytes),
+        actual_seqlen_(block_info.actual_q_seqlen),
+        params_scale_bmm2_(params.scale_bmm2_d ? *params.scale_bmm2_d : params.scale_bmm2)
+#ifdef GENERATE_CUBIN
+        ,
+        params_enable_i2f_trick_(false)
+#else
+        ,
+        params_enable_i2f_trick_(params.enable_i2f_trick)
+#endif
+        ,
+        o_ptr_(reinterpret_cast<char*>(params.o_ptr))
+#if USE_DEMO_BERT_PARAMS
+        ,
+        o_scratch_ptr_(nullptr) {
+#else
+        ,
+        o_scratch_ptr_(reinterpret_cast<uint4*>(params.o_scratch_ptr)) {
+#endif
+
+    // Compute the position in the sequence (within the CTA for the moment).
+    int row = tidx / THREADS_PER_ROW;
+    // Compute the position of the thread in the row.
+    int col = tidx % THREADS_PER_ROW;
+
+    // Is it an active thread for the very last STG?
+    if (HAS_INCOMPLETE_STG) {
+      is_active_for_last_stg_ = row + (STGS - 1) * ROWS_PER_STG < Cta_tile::M;
+    }
+
+    // Store the row to check against the length before loads.
+    row_ = cta_row_offset + row;
+    col_in_bytes_ = cta_col_offset_in_bytes + col * BYTES_PER_STG;
+
+    // The row offset in the batched GEMM.
+    int64_t row_offset = (int64_t)row_ * params.o_stride_in_bytes;
+    // The amount of bytes per row without padding (runtime).
+    int const valid_bytes_per_row = params.dv * BYTES_PER_ELEMENT;
+    // Take the batch/head offset into account.
+    row_offset += block_info.bidx * valid_bytes_per_row;
+    // Assemble the final pointer.
+    o_ptr_ += row_offset + col_in_bytes_;
+
+    // Is it an active thread?
+    is_active_ = ALL_THREADS_ACTIVE || (row < ROWS_PER_LOOP && col_in_bytes_ < VALID_BYTES_PER_ROW);
+
+    // Do not store if the thread is in the padded area.
+    is_active_ = is_active_ && col < valid_bytes_per_row / BYTES_PER_STG;
+
+    // For the scratch space, the pointer has int32 type so it accounts for the *4 factor.
+    o_scratch_ptr_ += blockIdx.y * STGS_PER_LOOP * ACTIVE_THREADS + tidx;
+  }
+
+  // Store data to global memory.
+  inline __device__ void store(uint4 const (&src)[STGS_PER_LOOP], int mi) {
+// Iterate over the different STGs.
+#pragma unroll
+    for (int ii = 0; ii < STGS_PER_LOOP; ++ii) {
+      // Break early if we exceed s_i...
+      int jj = mi * STGS_PER_LOOP + ii;
+      if (row_ + jj * ROWS_PER_STG >= actual_seqlen_) {
+        return;
+      }
+      using Src_type = typename Traits::Accumulator_type;
+      // Packs the 32bit/16bit values to 16bit.
+      // Depending on the type, applies extra scaling with parameter scale_bmm2.
+      Stg_packed_type dst = Acc_packer::run(this, src[ii]);
+      float const* row_ptr = reinterpret_cast<float const*>(&src[ii]);
+
+      // Store the result.
+      if (is_active_ && (!HAS_INCOMPLETE_STG || (jj < STGS - 1 || is_active_for_last_stg_))) {
+        fmha::stg(o_ptr_ + jj * ROWS_PER_STG * params_o_stride_in_bytes_, dst);
+      }
+    }
+  }
+
+  // The stride between rows for the O matrix.
+  int64_t params_o_stride_in_bytes_;
+  // The scaling factor used when converting the accumulators.
+  uint32_t const params_scale_bmm2_;
+  // Do we enable the i2f trick?
+  bool const params_enable_i2f_trick_;
+  // The pointer.
+  char* o_ptr_;
+  // The pointer to the scratch space to do the reduction (for CTAS_PER_HEAD > 1).
+  uint4* o_scratch_ptr_;
+  // The row, col stored by this thread (i.e. the position in that sequence).
+  int row_, col_in_bytes_;
+  // The size of the sequence length computed by that CTA.
+  int actual_seqlen_;
+
+  // Is it an active thread?
+  int is_active_, is_active_for_last_stg_;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template
+struct Gmem_tile_o_uint16 : public Gmem_tile_o_16bit {
+  using Base = Gmem_tile_o_16bit;
+
+  // Ctor.
+  template <typename Params, typename Block_info>
+  inline __device__ Gmem_tile_o_uint16(Params const& params, Block_info const& block_info, int tidx,
+                                       int cta_row_offset = 0, int cta_col_offset_in_bytes = 0)
+      : Base(params, block_info, tidx, cta_row_offset, cta_col_offset_in_bytes) {}
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template
+struct Gmem_tile_o_bfloat16
+    : public Gmem_tile_o_16bit {
+  using Base = Gmem_tile_o_16bit;
+
+  // Ctor.
+  template <typename Params, typename Block_info>
+  inline __device__ Gmem_tile_o_bfloat16(Params const& params, Block_info const& block_info,
+                                         int tidx, int cta_row_offset = 0,
+                                         int cta_col_offset_in_bytes = 0)
+      : Base(params, block_info, tidx, cta_row_offset, cta_col_offset_in_bytes) {}
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template
+struct Imma_gmem_tile_o_interleaved {
+  // The mma tile.
+ using Mma_tile = typename Traits::template Mma_tile; + + enum { VEC = 32 }; + + enum { NUM_SLICES = Cta_tile::N / VEC }; + + // DEBUG. + static_assert(NUM_SLICES == 1 || NUM_SLICES == 2, ""); + + // END OF DEBUG. + + // The size of each element. + enum { BYTES_PER_ELEMENT = 1 }; + + // The size of a row in bytes. + enum { BYTES_PER_ROW = VEC * BYTES_PER_ELEMENT }; + + // The size of each STG. + enum { BYTES_PER_STG = 4 }; + + // The number of threads to store a "row" of the matrix. We force it to 8 + enum { THREADS_PER_ROW = BYTES_PER_ROW / BYTES_PER_STG }; + + // DEBUG. + static_assert(THREADS_PER_ROW == 8 && BYTES_PER_STG == 4, ""); + + // END OF DEBUG. + + // the "logical" number of rows. think of rows per slice + enum { ROWS = Cta_tile::M }; + + // "physical" rows + enum { TOTAL_ROWS = ROWS * NUM_SLICES }; + + // The number of "rows" stored per iteration of the loop. The output of 1 MMA. + enum { ROWS_PER_LOOP_PER_SLICE = Mma_tile::M_PER_MMA_PER_CTA }; + + enum { ROWS_PER_LOOP = Mma_tile::M_PER_MMA_PER_CTA * NUM_SLICES }; + + // DEBUG. + static_assert(ROWS_PER_LOOP == 16 * Cta_tile::WARPS_M * NUM_SLICES, ""); + + // END OF DEBUG. + + // The number of outer loop for the stores. + enum { LOOPS = TOTAL_ROWS / ROWS_PER_LOOP }; + + // Make sure the math is correct. + static_assert(LOOPS == (int)Mma_tile::MMAS_M, ""); + + // The number of "rows" stored per STG -- for it to be the number of rows per MMA instruction. + enum { ROWS_PER_STG = Cta_tile::THREADS_PER_CTA / THREADS_PER_ROW }; + + // The number of STGs needed to store a chunk of the Q matrix. + enum { STGS_PER_LOOP = fmha::Div_up::VALUE }; + + enum { STGS_PER_SLICE = STGS_PER_LOOP / NUM_SLICES }; + + // DEBUG. + static_assert((Cta_tile::WARPS_M == 1 && STGS_PER_SLICE == 1) || + (Cta_tile::WARPS_M == 2 && STGS_PER_SLICE == 2), + ""); + + // END OF DEBUG. + + // Ctor. + template + inline __device__ Imma_gmem_tile_o_interleaved(Params const& params, Block_info const& block_info, + int tidx, int cta_row_offset = 0) + : params_o_stride_in_bytes_(params.o_stride_in_bytes), + actual_seqlen_(block_info.actual_seqlen - cta_row_offset), + params_scale_bmm2_(params.scale_bmm2), + params_enable_i2f_trick_(params.enable_i2f_trick), + o_ptr_(reinterpret_cast(params.o_ptr)), + total_(params.o_stride_in_bytes) { + int bidh = block_info.bidh; + int sum_s = block_info.sum_s; + + row_ = tidx / THREADS_PER_ROW; + int col = tidx % THREADS_PER_ROW; + + // h is N + // d is H + // want to save as: h x (d/32) x total x 32 (think 3 x h x (d/32) x b x s x 32) + + int block_offset = bidh * NUM_SLICES * total_ + sum_s; // bidh * GROUPS * B * S + b * S + int row_offset = (block_offset + cta_row_offset) * BYTES_PER_ROW; + + o_ptr_ += row_offset + col * BYTES_PER_STG; + } + + // Load data from global memory. + inline __device__ void load(uint4 (&dst)[STGS_PER_LOOP], int mi) { + static_assert(CTAS_PER_HEAD == 1, "Not implemented"); + } + + // Store data to global memory. + inline __device__ void store(uint4 const (&src)[STGS_PER_LOOP], int mi) { + int rows_so_far = mi * STGS_PER_LOOP * ROWS_PER_STG; + int rows_so_far_per_slice = rows_so_far / 2; + + // The scale. + float const& scale = reinterpret_cast(params_scale_bmm2_); + +// Iterate over the different STGs. 
+#pragma unroll + for (int ii = 0; ii < STGS_PER_LOOP; ++ii) { + // if(ii == 1) return; + // decompose the iteration into slice + int slice = ii / STGS_PER_SLICE; + int si = ii % STGS_PER_SLICE; + // dbg 256 + // assert(STGS_PER_SLICE == 1); + // assert(STGS_PER_LOOP == 2); + // assert(slice == ii); + // the number of rows one CTA-wide STG writes + static_assert(ROWS_PER_STG == 16, ""); // only holds for 4 warps/128 threads + int row_in_slice = row_ + si * ROWS_PER_STG + rows_so_far_per_slice; + + // we cannot return early, because the second half of iterates are + // responsible for the bottom slice + if (row_in_slice >= min(actual_seqlen_, ROWS)) { + continue; + } + + int offset = (slice * total_ + row_in_slice) * BYTES_PER_ROW; + + // The accumulators are in int32_t. + int4 const& val = reinterpret_cast(src[ii]); + + // if(threadIdx.x == 96){ + // printf("mi=%d ii=%d S=%d si=%d sofar=%d row=%d as=%d\n", mi, ii, slice, si, + // rows_so_far_per_slice, row_in_slice, actual_seqlen_) ; + // } + + uint32_t dst = quantize(val, scale, params_enable_i2f_trick_); + // Store the result. + fmha::stg(o_ptr_ + offset, dst); + } + } + + // Store data to global memory. + inline __device__ void store(uint4 const (&src)[STGS_PER_LOOP], uint4 const (&old)[STGS_PER_LOOP], + int mi) { + static_assert(CTAS_PER_HEAD == 1, "Not implemented"); + } + + // Move the pointer. + inline __device__ void move() { + o_ptr_ += (int64_t)ROWS * BYTES_PER_ROW; + actual_seqlen_ -= ROWS; + } + + // The stride between rows for the QKV matrice. + int64_t const params_o_stride_in_bytes_; + // The scaling factor to convert to int8. + uint32_t const params_scale_bmm2_; + // Do we enable the i2f trick? + bool const params_enable_i2f_trick_; + // The pointer. + char* o_ptr_; + int row_; + int actual_seqlen_; + int total_; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace v2 +} // namespace fmha diff --git a/csrc/fmha_v2/fmha/gmem_tile_ps.h b/csrc/fmha_v2/fmha/gmem_tile_ps.h new file mode 100644 index 0000000000..de150ff293 --- /dev/null +++ b/csrc/fmha_v2/fmha/gmem_tile_ps.h @@ -0,0 +1,837 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2011-2024 NVIDIA CORPORATION & AFFILIATES. All rights + * reserved. SPDX-License-Identifier: NVIDIA TensorRT Source Code License Agreement + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. + */ + +#pragma once +#include + +namespace fmha { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Store_accumulator {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Store_accumulator { + // The fragment. + using Acc = fmha::Fragment_accumulator; + + // Store. 
+ inline __device__ void store(char* ptr, int64_t step_m, int64_t step_n, Acc const& acc, + uint32_t scale) { + uint32_t acc_0 = fmha::hmul2(acc.reg(0), scale); + uint32_t acc_1 = fmha::hmul2(acc.reg(1), scale); + uint32_t acc_2 = fmha::hmul2(acc.reg(2), scale); + uint32_t acc_3 = fmha::hmul2(acc.reg(3), scale); + + fmha::stg(ptr + 0 * step_m + 0 * step_n, acc_0); + fmha::stg(ptr + 1 * step_m + 0 * step_n, acc_1); + fmha::stg(ptr + 0 * step_m + 1 * step_n, acc_2); + fmha::stg(ptr + 1 * step_m + 1 * step_n, acc_3); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Store_accumulator { + // The fragment. + using Accumulator = fmha::Fragment_accumulator; + + // Store. + inline __device__ void store(char* ptr, int64_t step_m, int64_t step_n, Accumulator const& acc, + uint32_t) { + int32_t tmp_0 = acc.elt(0); + int32_t tmp_1 = acc.elt(1); + int32_t tmp_2 = acc.elt(2); + int32_t tmp_3 = acc.elt(3); + int32_t tmp_4 = acc.elt(4); + int32_t tmp_5 = acc.elt(5); + int32_t tmp_6 = acc.elt(6); + int32_t tmp_7 = acc.elt(7); + +#if defined(USE_I2F_EMULATION_TRICK) + tmp_0 -= int32_t(FP32_I2F_MAGIC_NUMBER_HEX); + tmp_1 -= int32_t(FP32_I2F_MAGIC_NUMBER_HEX); + tmp_2 -= int32_t(FP32_I2F_MAGIC_NUMBER_HEX); + tmp_3 -= int32_t(FP32_I2F_MAGIC_NUMBER_HEX); + tmp_4 -= int32_t(FP32_I2F_MAGIC_NUMBER_HEX); + tmp_5 -= int32_t(FP32_I2F_MAGIC_NUMBER_HEX); + tmp_6 -= int32_t(FP32_I2F_MAGIC_NUMBER_HEX); + tmp_7 -= int32_t(FP32_I2F_MAGIC_NUMBER_HEX); +#endif + + uint32_t acc_0 = reinterpret_cast(tmp_0); + uint32_t acc_1 = reinterpret_cast(tmp_1); + uint32_t acc_2 = reinterpret_cast(tmp_2); + uint32_t acc_3 = reinterpret_cast(tmp_3); + uint32_t acc_4 = reinterpret_cast(tmp_4); + uint32_t acc_5 = reinterpret_cast(tmp_5); + uint32_t acc_6 = reinterpret_cast(tmp_6); + uint32_t acc_7 = reinterpret_cast(tmp_7); + + fmha::stg(ptr + 0 * step_m + 0 * step_n, make_uint2(acc_0, acc_1)); + fmha::stg(ptr + 1 * step_m + 0 * step_n, make_uint2(acc_4, acc_5)); + fmha::stg(ptr + 0 * step_m + 1 * step_n, make_uint2(acc_2, acc_3)); + fmha::stg(ptr + 1 * step_m + 1 * step_n, make_uint2(acc_6, acc_7)); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Store_accumulator { + // The instruction traits. + using Traits = Ampere_hmma_fp32_traits; + // The fragment. + using Accumulator = fmha::Fragment_accumulator; + + // Store. 
+ inline __device__ void store(char* ptr, int64_t step_m, int64_t step_n, Accumulator const& acc, + uint32_t scale) { + float const scalef = reinterpret_cast(scale); + + float const tmp_0 = acc.elt(0) * scalef; + float const tmp_1 = acc.elt(1) * scalef; + float const tmp_2 = acc.elt(2) * scalef; + float const tmp_3 = acc.elt(3) * scalef; + float const tmp_4 = acc.elt(4) * scalef; + float const tmp_5 = acc.elt(5) * scalef; + float const tmp_6 = acc.elt(6) * scalef; + float const tmp_7 = acc.elt(7) * scalef; + + uint32_t acc_0 = reinterpret_cast(tmp_0); + uint32_t acc_1 = reinterpret_cast(tmp_1); + uint32_t acc_2 = reinterpret_cast(tmp_2); + uint32_t acc_3 = reinterpret_cast(tmp_3); + uint32_t acc_4 = reinterpret_cast(tmp_4); + uint32_t acc_5 = reinterpret_cast(tmp_5); + uint32_t acc_6 = reinterpret_cast(tmp_6); + uint32_t acc_7 = reinterpret_cast(tmp_7); + + fmha::stg(ptr + 0 * step_m + 0 * step_n, make_uint2(acc_0, acc_1)); + fmha::stg(ptr + 1 * step_m + 0 * step_n, make_uint2(acc_2, acc_3)); + fmha::stg(ptr + 0 * step_m + 1 * step_n, make_uint2(acc_4, acc_5)); + fmha::stg(ptr + 1 * step_m + 1 * step_n, make_uint2(acc_6, acc_7)); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Store_accumulator { + // The instruction traits. + using Traits = Ampere_hmma_bf16_traits; + // The fragment. + using Accumulator = fmha::Fragment_accumulator; + + // Store. + inline __device__ void store(char* ptr, int64_t step_m, int64_t step_n, Accumulator const& acc, + uint32_t scale) { + float const scalef = reinterpret_cast(scale); + + float const tmp_0 = acc.elt(0) * scalef; + float const tmp_1 = acc.elt(1) * scalef; + float const tmp_2 = acc.elt(2) * scalef; + float const tmp_3 = acc.elt(3) * scalef; + float const tmp_4 = acc.elt(4) * scalef; + float const tmp_5 = acc.elt(5) * scalef; + float const tmp_6 = acc.elt(6) * scalef; + float const tmp_7 = acc.elt(7) * scalef; + + uint32_t acc_0 = reinterpret_cast(tmp_0); + uint32_t acc_1 = reinterpret_cast(tmp_1); + uint32_t acc_2 = reinterpret_cast(tmp_2); + uint32_t acc_3 = reinterpret_cast(tmp_3); + uint32_t acc_4 = reinterpret_cast(tmp_4); + uint32_t acc_5 = reinterpret_cast(tmp_5); + uint32_t acc_6 = reinterpret_cast(tmp_6); + uint32_t acc_7 = reinterpret_cast(tmp_7); + + fmha::stg(ptr + 0 * step_m + 0 * step_n, make_uint2(acc_0, acc_1)); + fmha::stg(ptr + 1 * step_m + 0 * step_n, make_uint2(acc_2, acc_3)); + fmha::stg(ptr + 0 * step_m + 1 * step_n, make_uint2(acc_4, acc_5)); + fmha::stg(ptr + 1 * step_m + 1 * step_n, make_uint2(acc_6, acc_7)); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ uint16_t pack_char2(uint32_t a, uint32_t b) { + uint32_t dst; + asm volatile("prmt.b32 %0, %1, %2, 0x0040;\n" : "=r"(dst) : "r"(a), "r"(b)); + return reinterpret_cast(dst); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Store_accumulator { + // The fragment. + using Accumulator = fmha::Fragment_accumulator; + + // Store. + inline __device__ void store(char* ptr, int64_t step_m, int64_t step_n, Accumulator const& acc, + uint32_t) { + // Pack pairs of values. + uint16_t tmp_00 = pack_char2(acc.reg(0), acc.reg(1)); + uint16_t tmp_01 = pack_char2(acc.reg(2), acc.reg(3)); + uint16_t tmp_10 = pack_char2(acc.reg(4), acc.reg(5)); + uint16_t tmp_11 = pack_char2(acc.reg(6), acc.reg(7)); + + // Store to memory. 
+ fmha::stg(ptr + 0 * step_m + 0 * step_n, tmp_00); + fmha::stg(ptr + 1 * step_m + 0 * step_n, tmp_10); + fmha::stg(ptr + 0 * step_m + 1 * step_n, tmp_01); + fmha::stg(ptr + 1 * step_m + 1 * step_n, tmp_11); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Store_accumulator { + // The traits. + using Traits = fmha::Ampere_imma_int8_int32_traits; + // The fragment. + using Accumulator = fmha::Fragment_accumulator; + + // Store. + inline __device__ void store(char* ptr, int64_t step_m, int64_t step_n, Accumulator const& acc, + uint32_t) { + int32_t tmp_0 = acc.elt(0); + int32_t tmp_1 = acc.elt(1); + int32_t tmp_2 = acc.elt(2); + int32_t tmp_3 = acc.elt(3); + int32_t tmp_4 = acc.elt(4); + int32_t tmp_5 = acc.elt(5); + int32_t tmp_6 = acc.elt(6); + int32_t tmp_7 = acc.elt(7); + +#if defined(USE_I2F_EMULATION_TRICK) + tmp_0 -= int32_t(FP32_I2F_MAGIC_NUMBER_HEX); + tmp_1 -= int32_t(FP32_I2F_MAGIC_NUMBER_HEX); + tmp_2 -= int32_t(FP32_I2F_MAGIC_NUMBER_HEX); + tmp_3 -= int32_t(FP32_I2F_MAGIC_NUMBER_HEX); + tmp_4 -= int32_t(FP32_I2F_MAGIC_NUMBER_HEX); + tmp_5 -= int32_t(FP32_I2F_MAGIC_NUMBER_HEX); + tmp_6 -= int32_t(FP32_I2F_MAGIC_NUMBER_HEX); + tmp_7 -= int32_t(FP32_I2F_MAGIC_NUMBER_HEX); +#endif + + uint32_t acc_0 = reinterpret_cast(tmp_0); + uint32_t acc_1 = reinterpret_cast(tmp_1); + uint32_t acc_2 = reinterpret_cast(tmp_2); + uint32_t acc_3 = reinterpret_cast(tmp_3); + uint32_t acc_4 = reinterpret_cast(tmp_4); + uint32_t acc_5 = reinterpret_cast(tmp_5); + uint32_t acc_6 = reinterpret_cast(tmp_6); + uint32_t acc_7 = reinterpret_cast(tmp_7); + + fmha::stg(ptr + 0 * step_m + 0 * step_n, make_uint2(acc_0, acc_1)); + fmha::stg(ptr + 1 * step_m + 0 * step_n, make_uint2(acc_2, acc_3)); + fmha::stg(ptr + 0 * step_m + 1 * step_n, make_uint2(acc_4, acc_5)); + fmha::stg(ptr + 1 * step_m + 1 * step_n, make_uint2(acc_6, acc_7)); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Store_accumulator { + // The traits. + using Traits = fmha::Ampere_imma_int8_int32_traits; + // The fragment. + using Accumulator = fmha::Fragment_accumulator; + + // Store. + inline __device__ void store(char* ptr, int64_t step_m, int64_t step_n, Accumulator const& acc, + uint32_t) { + // Pack pairs of values. + uint16_t tmp_00 = pack_char2(acc.reg(0), acc.reg(1)); + uint16_t tmp_01 = pack_char2(acc.reg(4), acc.reg(5)); + uint16_t tmp_10 = pack_char2(acc.reg(2), acc.reg(3)); + uint16_t tmp_11 = pack_char2(acc.reg(6), acc.reg(7)); + + // Store to memory. + fmha::stg(ptr + 0 * step_m + 0 * step_n, tmp_00); + fmha::stg(ptr + 1 * step_m + 0 * step_n, tmp_10); + fmha::stg(ptr + 0 * step_m + 1 * step_n, tmp_01); + fmha::stg(ptr + 1 * step_m + 1 * step_n, tmp_11); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Store_accumulator< + fmha::Hopper_hgmma_fp16_traits, 16> { + // The traits. + using Traits = fmha::Hopper_hgmma_fp16_traits; + // The fragment. + using Accumulator = fmha::Fragment_accumulator; + + // The number of rows accessed by each thread. + enum { ROWS_PER_THREAD = GMMA_M / 8 / 4 }; + + // The number of columns access by each thread. + // Note there are 2 elements per reg. + enum { COLUMNS_PER_THREAD = GMMA_N / 4 / 2 }; + + // The number of accumulator held by each thread, per HGMMA instruction. 
+ enum { ELEMENT_PER_THREAD = ROWS_PER_THREAD * COLUMNS_PER_THREAD }; + + // Store. + inline __device__ void store(char* ptr, int64_t step_m, int64_t step_n, Accumulator const& acc, + uint32_t scale) { +#pragma unroll + for (int col_idx = 0; col_idx < COLUMNS_PER_THREAD; ++col_idx) { +#pragma unroll + for (int row_idx = 0; row_idx < ROWS_PER_THREAD; ++row_idx) { + uint32_t acc_0 = fmha::hmul2(acc.reg(col_idx * ROWS_PER_THREAD + row_idx), scale); + // float one = 1.f; + // if(col_idx > 2){ + // acc_0 = float2_to_half2(one, one); + // } + int64_t offset = (int64_t)row_idx * step_m + (int64_t)col_idx * step_n; + fmha::stg(ptr + offset, acc_0); + } + } + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Store_accumulator< + fmha::Hopper_qgmma_fp8_fp32_traits, + 32> { + // The traits. + using Traits = fmha::Hopper_qgmma_fp8_fp32_traits; + // The fragment. + using Accumulator = fmha::Fragment_accumulator; + + // The number of rows accessed by each thread. + enum { ROWS_PER_THREAD = GMMA_M / 8 / 4 }; + + // The number of columns access by each thread. + // Note there are 2 elements per reg. + enum { COLUMNS_PER_THREAD = GMMA_N / 8 }; + + // The number of accumulator held by each thread, per HGMMA instruction. + enum { ELEMENT_PER_THREAD = ROWS_PER_THREAD * COLUMNS_PER_THREAD }; + + // Store. + inline __device__ void store(char* ptr, int64_t step_m, int64_t step_n, Accumulator const& acc, + uint32_t scale) { + float const scalef = reinterpret_cast(scale); +#pragma unroll + for (int col_idx = 0; col_idx < COLUMNS_PER_THREAD; ++col_idx) { +#pragma unroll + for (int row_idx = 0; row_idx < ROWS_PER_THREAD; ++row_idx) { + float const acc_0 = acc.elt((col_idx * ROWS_PER_THREAD + row_idx) * 2 + 0) * scalef; + float const acc_1 = acc.elt((col_idx * ROWS_PER_THREAD + row_idx) * 2 + 1) * scalef; + uint2 acc_; + acc_.x = reinterpret_cast(acc_0); + acc_.y = reinterpret_cast(acc_1); + int64_t offset = (int64_t)row_idx * step_m + (int64_t)col_idx * step_n; + fmha::stg(ptr + offset, acc_); + } + } + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Store_accumulator< + fmha::Hopper_igmma_int8_int32_traits, 32> { + // The traits. + using Traits = fmha::Hopper_igmma_int8_int32_traits; + // The fragment. + using Accumulator = fmha::Fragment_accumulator; + + // The number of rows accessed by each thread. + enum { ROWS_PER_THREAD = GMMA_M / 8 / 4 }; + + // The number of columns access by each thread. + // Note there are 2 elements per reg. + enum { COLUMNS_PER_THREAD = GMMA_N / 8 }; + + // The number of accumulator held by each thread, per HGMMA instruction. + enum { ELEMENT_PER_THREAD = ROWS_PER_THREAD * COLUMNS_PER_THREAD }; + + // Store. 
+ inline __device__ void store(char* ptr, int64_t step_m, int64_t step_n, Accumulator const& acc, + uint32_t scale) { +#pragma unroll + for (int col_idx = 0; col_idx < COLUMNS_PER_THREAD; ++col_idx) { +#pragma unroll + for (int row_idx = 0; row_idx < ROWS_PER_THREAD; ++row_idx) { + int32_t const acc_0 = acc.elt((col_idx * ROWS_PER_THREAD + row_idx) * 2 + 0); + int32_t const acc_1 = acc.elt((col_idx * ROWS_PER_THREAD + row_idx) * 2 + 1); + uint2 acc_; + acc_.x = reinterpret_cast(acc_0); + acc_.y = reinterpret_cast(acc_1); + int64_t offset = (int64_t)row_idx * step_m + (int64_t)col_idx * step_n; + fmha::stg(ptr + offset, acc_); + } + } + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static __device__ inline uint16_t pack_e4m3x2(float const x, float const y) { +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) + uint16_t storage; + asm volatile("{cvt.rn.satfinite.e4m3x2.f32 %0, %2, %1;}\n" : "=h"(storage) : "f"(x), "f"(y)); + return storage; +#else + assert(false); + return 0; +#endif +} + +static __device__ inline uint16_t pack_e5m2x2(float const x, float const y) { +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) + uint16_t storage; + asm volatile("{cvt.rn.satfinite.e5m2x2.f32 %0, %2, %1;}\n" : "=h"(storage) : "f"(x), "f"(y)); + return storage; +#else + assert(false); + return 0; +#endif +} + +template +__device__ inline uint16_t pack_fp8x2(float const x, float const y); + +template <> +__device__ inline uint16_t pack_fp8x2(float const x, float const y) { + return pack_e4m3x2(x, y); +} + +template <> +__device__ inline uint16_t pack_fp8x2(float const x, float const y) { + return pack_e5m2x2(x, y); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Store_accumulator< + fmha::Hopper_qgmma_fp8_fp32_traits, + 8> { + // The traits. + using Traits = fmha::Hopper_qgmma_fp8_fp32_traits; + // The fragment. + using Accumulator = fmha::Fragment_accumulator; + + // The number of rows accessed by each thread. + enum { ROWS_PER_THREAD = GMMA_M / 8 / 4 }; + + // The number of columns access by each thread. + // Note there are 2 elements per reg. + enum { COLUMNS_PER_THREAD = GMMA_N / 8 }; + + // The number of accumulator held by each thread, per HGMMA instruction. + enum { ELEMENT_PER_THREAD = ROWS_PER_THREAD * COLUMNS_PER_THREAD }; + + // Store. + inline __device__ void store(char* ptr, int64_t step_m, int64_t step_n, Accumulator const& acc, + uint32_t) { +#pragma unroll + for (int col_idx = 0; col_idx < COLUMNS_PER_THREAD; ++col_idx) { +#pragma unroll + for (int row_idx = 0; row_idx < ROWS_PER_THREAD; ++row_idx) { + float const acc_0 = acc.elt((col_idx * ROWS_PER_THREAD + row_idx) * 2 + 0); + float const acc_1 = acc.elt((col_idx * ROWS_PER_THREAD + row_idx) * 2 + 1); + // uint16_t acc_ = pack_e4m3x2(acc_0, acc_1); + uint16_t acc_ = pack_fp8x2(acc_0, acc_1); + int64_t offset = (int64_t)row_idx * step_m + (int64_t)col_idx * step_n; + fmha::stg(ptr + offset, acc_); + } + } + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Store_accumulator< + fmha::Hopper_igmma_int8_int32_traits, 8> { + // The traits. + using Traits = fmha::Hopper_igmma_int8_int32_traits; + // The fragment. + using Accumulator = fmha::Fragment_accumulator; + + // The number of rows accessed by each thread. + enum { ROWS_PER_THREAD = GMMA_M / 8 / 4 }; + + // The number of columns access by each thread. 
+ // Note there are 2 elements per reg. + enum { COLUMNS_PER_THREAD = GMMA_N / 8 }; + + // The number of accumulator held by each thread, per HGMMA instruction. + enum { ELEMENT_PER_THREAD = ROWS_PER_THREAD * COLUMNS_PER_THREAD }; + + // Store. + inline __device__ void store(char* ptr, int64_t step_m, int64_t step_n, Accumulator const& acc, + uint32_t) { +#pragma unroll + for (int col_idx = 0; col_idx < COLUMNS_PER_THREAD; ++col_idx) { +#pragma unroll + for (int row_idx = 0; row_idx < ROWS_PER_THREAD; ++row_idx) { + uint32_t const acc_0 = acc.reg((col_idx * ROWS_PER_THREAD + row_idx) * 2 + 0); + uint32_t const acc_1 = acc.reg((col_idx * ROWS_PER_THREAD + row_idx) * 2 + 1); + uint16_t acc_ = pack_char2(acc_0, acc_1); + int64_t offset = (int64_t)row_idx * step_m + (int64_t)col_idx * step_n; + fmha::stg(ptr + offset, acc_); + } + } + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Gmem_tile_ps { + // The associated MMA tile. + using Mma_tile = typename Traits::template Mma_tile; + + // The number of elements per STG. + enum { ELEMENTS_PER_STG = 2 }; + + // The size in bytes of each element. + enum { BYTES_PER_ELEMENT = BITS_PER_ELEMENT / 8 }; + + // The size of each STG. + enum { BYTES_PER_STG = ELEMENTS_PER_STG * BYTES_PER_ELEMENT }; + + // The size of a row in bytes. + enum { BYTES_PER_ROW = Cta_tile::N * BYTES_PER_ELEMENT }; + + // // DEBUG. + // static_assert(BYTES_PER_ROW == 384 || BYTES_PER_ROW == 768 || BYTES_PER_ROW == 1536, ""); + // // END OF DEBUG. + + // Ctor. + inline __device__ Gmem_tile_ps(void* ptr, int64_t const params_stride_in_bytes, + uint32_t const params_scale, int tidx, int cta_row_offset = 0) + : params_stride_in_bytes_(params_stride_in_bytes), + params_scale_(params_scale), + ptr_(reinterpret_cast(ptr)) { + // For storing P and S, we do not take into account variable sequence length. + + // The block index for the batch. + int const bidb = blockIdx.y; + // The block index for the head. + int const bidh = blockIdx.x; + // The block index. + int bidx = bidb * gridDim.x + bidh; + + // Decompose the position of the thread into warp/lane. + int warp = tidx / Cta_tile::THREADS_PER_WARP; + int lane = tidx % Cta_tile::THREADS_PER_WARP; + + // Compute the position in the sequence (within the CTA for the moment). + int row = warp % Cta_tile::WARPS_M * Mma_tile::M_PER_MMA + lane / 4 + cta_row_offset; + // Compute the position of the thread in the row. + int col = warp / Cta_tile::WARPS_M * Mma_tile::N_PER_MMA + lane % 4 * ELEMENTS_PER_STG; + + // The offset of the 1st row written by the thread. We store the P matrix interleaved. + int64_t row_offset = (int64_t)row * params_stride_in_bytes_ + bidx * BYTES_PER_ROW; + // Finalize the pointer. + ptr_ += row_offset + col * BYTES_PER_ELEMENT; + } + + // Store data to memory. + template + inline __device__ void store(Accumulators const (&acc)[M][N]) { + // A thread holds packet of 2 elements. In 2x2 tile per MMA. + int64_t const step_m = 8 * params_stride_in_bytes_; + int64_t const step_n = 8 * BYTES_PER_ELEMENT; + +// Store the different accumulators. +#pragma unroll + for (int mi = 0; mi < M; ++mi) { +#pragma unroll + for (int ni = 0; ni < N; ++ni) { + int64_t offset = (int64_t)mi * Mma_tile::M_PER_MMA_PER_CTA * params_stride_in_bytes_ + + ni * Mma_tile::N_PER_MMA_PER_CTA * BYTES_PER_ELEMENT; + Store_accumulator delegate; + delegate.store(ptr_ + offset, step_m, step_n, acc[mi][ni], params_scale_); + } + } + } + + // Move to the next location. 
+ inline __device__ void move() { ptr_ += (int64_t)Cta_tile::M * params_stride_in_bytes_; } + + inline __device__ void move_n() { ptr_ += (int64_t)Cta_tile::N * BYTES_PER_ELEMENT; } + + // The stride between rows for the QKV matrice. + int64_t const params_stride_in_bytes_; + // The scale to apply before storing the element. + uint32_t const params_scale_; + // The pointer. + char* ptr_; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Gmem_tile_ps { + // The traits class. + using Traits = Volta_hmma_fp16_traits; + // The associated MMA tile. + using Mma_tile = typename Traits::template Mma_tile; + + // The number of elements per STG. + enum { ELEMENTS_PER_STG = 4 }; + + // The size in bytes of each element. + enum { BYTES_PER_ELEMENT = 2 }; + + // The size of each STG. + enum { BYTES_PER_STG = ELEMENTS_PER_STG * BYTES_PER_ELEMENT }; + + // The size of a row in bytes. + enum { BYTES_PER_ROW = Cta_tile::N * BYTES_PER_ELEMENT }; + + // Ctor. + inline __device__ Gmem_tile_ps(void* ptr, int64_t const params_stride_in_bytes, + uint32_t const params_scale, int tidx, int cta_row_offset = 0) + : params_stride_in_bytes_(params_stride_in_bytes), + params_scale_(params_scale), + ptr_(reinterpret_cast(ptr)) { + // For storing P and S, we do not take into account variable sequence lengths. + + // The block index for the batch. + int const bidb = blockIdx.y; + // The block index for the head. + int const bidh = blockIdx.x; + // The block index. + int bidx = bidb * gridDim.x + bidh; + + // Decompose the position of the thread into warp/lane. + int warp = tidx / Cta_tile::THREADS_PER_WARP; + int lane = tidx % Cta_tile::THREADS_PER_WARP; + + // DEBUG. + static_assert(Mma_tile::M_PER_MMA == 16 && Mma_tile::N_PER_MMA == 16, ""); + // END OF DEBUG. + + // The position of the warp. + int warp_row = warp % Cta_tile::WARPS_M * Mma_tile::M_PER_MMA; + int warp_col = warp / Cta_tile::WARPS_M * Mma_tile::N_PER_MMA; + + // Compute the position of the thread (within the CTA for the moment). + int row = warp_row + (lane & 0x10) / 2 + (lane & 0x07); + int col = warp_col + (lane & 0x08) / 2; + + // // DEBUG. + // printf("tidx=%3d row=%3d col=%3d\n", tidx, row, col); + // // END OF DEBUG. + + // The offset of the 1st row written by the thread. We store the P matrix interleaved. + int64_t row_offset = + (int64_t)row * params_stride_in_bytes_ + bidx * BYTES_PER_ROW + cta_row_offset; + + // Finalize the pointer. + ptr_ += row_offset + col * BYTES_PER_ELEMENT; + } + + // Store data to memory. + template + inline __device__ void store(Accumulators const (&acc)[M][N]) { +// Store the different accumulators. +#pragma unroll + for (int mi = 0; mi < M; ++mi) { +#pragma unroll + for (int ni = 0; ni < N; ++ni) { + // Scale the accumulators. + uint32_t acc_0 = fmha::hmul2(acc[mi][ni].reg(0), params_scale_); + uint32_t acc_1 = fmha::hmul2(acc[mi][ni].reg(1), params_scale_); + uint32_t acc_2 = fmha::hmul2(acc[mi][ni].reg(2), params_scale_); + uint32_t acc_3 = fmha::hmul2(acc[mi][ni].reg(3), params_scale_); + + // The offsets. + int row = mi * Mma_tile::M_PER_MMA_PER_CTA; + int col = ni * Mma_tile::N_PER_MMA_PER_CTA * BYTES_PER_ELEMENT; + + // The offset in bytes. + int64_t offset = (int64_t)row * params_stride_in_bytes_ + col; + + // In one MMA, 16 FP16s are interleaved between threads i and i+8 in groups of 4. 
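+        // Each thread therefore issues two 8-byte STGs: elements [col, col+3] and
+        // [col+8, col+11] of the row.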
+ fmha::stg(&ptr_[offset + 0 * BYTES_PER_ELEMENT], make_uint2(acc_0, acc_1)); + fmha::stg(&ptr_[offset + 8 * BYTES_PER_ELEMENT], make_uint2(acc_2, acc_3)); + } + } + } + + // Move to the next location. + inline __device__ void move() { ptr_ += (int64_t)Cta_tile::M * params_stride_in_bytes_; } + + // The stride between rows for the QKV matrice. + int64_t const params_stride_in_bytes_; + // The scale to apply before storing the element. + uint32_t const params_scale_; + // The pointer. + char* ptr_; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Gmem_tile_p : public Gmem_tile_ps { + // The base class. + using Base = Gmem_tile_ps; + + // Ctor. + inline __device__ Gmem_tile_p(void* ptr, int64_t const params_stride_in_bytes, + uint32_t const params_scale, int tidx, int cta_row_offset = 0) + : Base(ptr, params_stride_in_bytes, params_scale, tidx, cta_row_offset) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// Not super proud of this. Need to refactor. +template +struct Gmem_tile_ps_hopper { + // The associated MMA tile. + using Mma_tile = typename Traits::template Mma_tile; + + // The number of elements per STG. + enum { ELEMENTS_PER_STG = 2 }; + + // The size in bytes of each element. + enum { BYTES_PER_ELEMENT = BITS_PER_ELEMENT / 8 }; + + // The size of each STG. + enum { BYTES_PER_STG = ELEMENTS_PER_STG * BYTES_PER_ELEMENT }; + + // The size of a row in bytes. + enum { BYTES_PER_ROW = Cta_tile::N * BYTES_PER_ELEMENT }; + + // Ctor. + inline __device__ Gmem_tile_ps_hopper(void* ptr, int64_t const params_stride_in_bytes, + int64_t const bytes_per_row, uint32_t const params_scale, + int tidx) + : params_stride_in_bytes_(params_stride_in_bytes), + params_scale_(params_scale), + ptr_(reinterpret_cast(ptr)) { + // For storing P and S, we do not take into account variable sequence length. + + // The block index for the batch. + int const bidb = blockIdx.y; + // The block index for the head. + int const bidh = blockIdx.x; + // The block index. + int bidx = bidb * gridDim.x + bidh; + + // Decompose the position of the thread into warp/lane. + int warp = tidx / Cta_tile::THREADS_PER_WARP; + int lane = tidx % Cta_tile::THREADS_PER_WARP; + + int warpgroup_idx = warp / 4; + int warp_idx_within_warpgroup = warp % 4; + + // Compute the position in the sequence (within the CTA for the moment). + int row = warp_idx_within_warpgroup * (Mma_tile::M_PER_MMA / 4) + lane / 4; + // Compute the position of the thread in the row. + int col = warpgroup_idx * Mma_tile::N_PER_MMA + lane % 4 * ELEMENTS_PER_STG; + + // The offset of the 1st row written by the thread. We store the P matrix interleaved. + int64_t row_offset = (int64_t)row * params_stride_in_bytes_ + bidx * bytes_per_row; + // Finalize the pointer. + ptr_ += row_offset + col * BYTES_PER_ELEMENT; + } + + // Ctor. + inline __device__ Gmem_tile_ps_hopper(void* ptr, int64_t const params_stride_in_bytes, + uint32_t const params_scale, int tidx) + : Gmem_tile_ps_hopper(ptr, params_stride_in_bytes, BYTES_PER_ROW, params_scale, tidx) {} + + // Store data to memory. + template + inline __device__ void store(Accumulators const (&acc)[M][N]) { + // A thread holds packet of 2 elements. In 2x2 tile per MMA. + // Need to figure out if we need this for hopper. + int64_t const step_m = 8 * (this->params_stride_in_bytes_); + int64_t const step_n = 8 * BYTES_PER_ELEMENT; + +// Store the different accumulators. 
+#pragma unroll + for (int mi = 0; mi < M; ++mi) { +#pragma unroll + for (int ni = 0; ni < N; ++ni) { + int64_t offset = + (int64_t)mi * Mma_tile::M_PER_MMA_PER_CTA * (this->params_stride_in_bytes_) + + ni * Mma_tile::N_PER_MMA_PER_CTA * BYTES_PER_ELEMENT; + + Store_accumulator delegate; + delegate.store(this->ptr_ + offset, step_m, step_n, acc[mi][ni], this->params_scale_); + } + } + } + + // Move to the next location. + inline __device__ void move() { ptr_ += (int64_t)Cta_tile::M * params_stride_in_bytes_; } + + // The stride between rows for the QKV matrice. + int64_t const params_stride_in_bytes_; + // The scale to apply before storing the element. + uint32_t const params_scale_; + // The pointer. + char* ptr_; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Gmem_tile_s : public Gmem_tile_ps { + // The base class. + using Base = Gmem_tile_ps; + + // Ctor. + inline __device__ Gmem_tile_s(void* ptr, int64_t const params_stride_in_bytes, + uint32_t const params_scale, int tidx) + : Base(ptr, params_stride_in_bytes, params_scale, tidx) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Gmem_tile_s + : public Gmem_tile_ps { + // The base class. + using Base = Gmem_tile_ps; + + // Ctor. + inline __device__ Gmem_tile_s(void* ptr, int64_t const params_stride_in_bytes, + uint32_t const params_scale, int tidx, int cta_row_offset = 0) + : Base(ptr, params_stride_in_bytes, + float_to_half2(reinterpret_cast(params_scale)), tidx, cta_row_offset) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace fmha diff --git a/csrc/fmha_v2/fmha/gmem_tile_qkv.h b/csrc/fmha_v2/fmha/gmem_tile_qkv.h new file mode 100644 index 0000000000..0c0af5c8e4 --- /dev/null +++ b/csrc/fmha_v2/fmha/gmem_tile_qkv.h @@ -0,0 +1,167 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2011-2024 NVIDIA CORPORATION & AFFILIATES. All rights + * reserved. SPDX-License-Identifier: NVIDIA TensorRT Source Code License Agreement + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. + */ + +#pragma once + +namespace fmha { +namespace v1 { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + // The instruction traits. + typename Traits, + // The dimensions of the tile computed by the CTA. + typename Cta_tile, + // The number of bits per element. + int BITS_PER_ELEMENT, + // The number of rows of Q, K or V loaded by this tile. + int ROWS_, + // The number of columns. + int COLS, + // The number of valid columns + int VALID_COLS, + // Do we use LDGSTS? + bool USE_LDGSTS_, + // Are attention heads interleaved? + bool HEADS_INTERLEAVED, + // Number of matrices + int NUM_MATS = 3, + // Is sliding window attention used ? + bool SLIDING_WINDOW_ATTENTION = false> +struct Gmem_tile_qkv { + // The size of each LDG. + enum { BYTES_PER_LDG = 16 }; + + // The size of a row in bytes. + enum { BYTES_PER_ROW = COLS * BITS_PER_ELEMENT / 8 }; + + // The number of threads to load a "row" of the matrix. 
+ enum { THREADS_PER_ROW = BYTES_PER_ROW / BYTES_PER_LDG }; + + // The valid size of a row in bytes. + enum { VALID_BYTES_PER_ROW = VALID_COLS * BITS_PER_ELEMENT / 8 }; + + // The valid number of threads to load a "row" of the matrix. + enum { VALID_THREADS_PER_ROW = VALID_BYTES_PER_ROW / BYTES_PER_LDG }; + + // The number of "rows" loaded per LDG. + enum { ROWS_PER_LDG = Cta_tile::THREADS_PER_CTA / THREADS_PER_ROW }; + + // The number of rows. + enum { ROWS = ROWS_ }; + + // The number of LDGs needed to load a chunk of the Q matrix. + enum { LDGS = fmha::Div_up::VALUE }; + + // The number of predicate registers. + enum { PRED_REGS = fmha::Compute_number_of_pred_regs::VALUE }; + + // Make sure we use a single register to store predicates. + static_assert(PRED_REGS == 1, ""); + + // We do not use LDGSTS (for the moment). + enum { USE_LDGSTS = USE_LDGSTS_ }; + + // Ctor. + template + inline __device__ Gmem_tile_qkv(Params const& params, int qkv_offset, Block_info const& binfo, + int tidx, int cta_row_offset = 0) + + // in PACKED_QKV, q_stride = k_stride = v_stride + : params_qkv_stride_in_bytes_(params.q_stride_in_bytes), + qkv_ptr_(reinterpret_cast(params.qkv_ptr)) { + // Compute the position in the sequence (within the CTA for the moment). + int row = tidx / THREADS_PER_ROW; + // Compute the position of the thread in the row. + int col = tidx % THREADS_PER_ROW; + + // Prepare predicates. + uint32_t preds[LDGS]; +#pragma unroll + for (int ii = 0; ii < LDGS; ++ii) { + preds[ii] = row + ii * ROWS_PER_LDG < ROWS; + } + + // Pack the predicates. + preds_[0] = fmha::pack_predicates(preds); + + // The row offset in the batched GEMM. For each seq element, we store QKV in that order. + int64_t row_offset = (int64_t)(row + cta_row_offset) * params_qkv_stride_in_bytes_; + // Add the block index. + int idx; + if (HEADS_INTERLEAVED) { + idx = binfo.bidx * NUM_MATS + qkv_offset; + } else { + idx = (params.b * params.s * NUM_MATS + qkv_offset) * params.h + binfo.bidh; + } + // Assemble the final pointer. + qkv_ptr_ += row_offset + idx * VALID_BYTES_PER_ROW + col * BYTES_PER_LDG; + + // active threads + is_active_ = col < VALID_THREADS_PER_ROW; + } + + // Store data to shared memory. + template + inline __device__ void commit(Smem_tile& smem_tile) { + if (!USE_LDGSTS) { + smem_tile.store(fetch_); + } + } + + // Load data from memory. + template + inline __device__ void load(Smem_tile& smem_tile) { + void const* ptrs[LDGS]; +#pragma unroll + for (int ii = 0; ii < LDGS; ++ii) { + ptrs[ii] = qkv_ptr_ + (int64_t)ii * ROWS_PER_LDG * params_qkv_stride_in_bytes_; + } + if (USE_LDGSTS) { + smem_tile.store(ptrs, preds_); + } else { + fmha::ldg(fetch_, ptrs, preds_); + } + } + + // Load data from global memory, shared mem is not needed + inline __device__ void load() { + void const* ptrs[LDGS]; + if (is_active_) { +#pragma unroll + for (int ii = 0; ii < LDGS; ++ii) { + ptrs[ii] = qkv_ptr_ + (int64_t)ii * ROWS_PER_LDG * params_qkv_stride_in_bytes_; + } + fmha::ldg(fetch_, ptrs, preds_); + } + } + + // Move the pointer to the next location. + inline __device__ void move() { qkv_ptr_ += (int64_t)ROWS * params_qkv_stride_in_bytes_; } + + // The stride between rows for the QKV matrice. + int64_t const params_qkv_stride_in_bytes_; + // The pointer. + char const* qkv_ptr_; + // The register to store predicates. + uint32_t preds_[PRED_REGS]; + // The fetch registers. 
+ uint4 fetch_[LDGS]; + // The active LDG threads + bool is_active_; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace v1 +} // namespace fmha diff --git a/csrc/fmha_v2/fmha/gmem_tile_qkv_packed.h b/csrc/fmha_v2/fmha/gmem_tile_qkv_packed.h new file mode 100644 index 0000000000..00797d0a01 --- /dev/null +++ b/csrc/fmha_v2/fmha/gmem_tile_qkv_packed.h @@ -0,0 +1,1307 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2011-2024 NVIDIA CORPORATION & AFFILIATES. All rights + * reserved. SPDX-License-Identifier: NVIDIA TensorRT Source Code License Agreement + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. + */ + +#pragma once +#include +#include + +namespace fmha { +namespace v2 { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Ldgsts_helper { + template + static inline __device__ void load(This* this_, Smem_tile& smem_tile, void const* (&ptrs)[LDGS], + uint32_t (&preds)[LDGS]) { + fmha::pack_predicates(this_->preds_, preds); + smem_tile.store(ptrs, this_->preds_); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Ldgsts_helper<0> { + template + static inline __device__ void load(This* this_, Smem_tile& smem_tile, void const* (&ptrs)[LDGS], + uint32_t (&preds)[LDGS]) { +#if 0 + fmha::pack_predicates(this_->preds_, preds); + fmha::ldg(this_->fetch_, ptrs, this_->preds_); +#else +#pragma unroll + for (int ii = 0; ii < LDGS; ++ii) { + this_->fetch_[ii] = make_uint4(0u, 0u, 0u, 0u); + } + // not packing predicates removes restrictions (e.g. FP16 384, 4 warps) + Ldg_functor fct(this_->fetch_, ptrs); +#pragma unroll + for (int ii = 0; ii < LDGS; ++ii) { + fct.ldgsts(ii, preds[ii]); + } +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + // The instruction traits. + typename Traits, + // The dimensions of the tile computed by the CTA. + typename Cta_tile, + // The number of bits per element. + int BITS_PER_ELEMENT_, + // The number of rows of Q, K or V loaded by this tile. + int ROWS_, + // The number of columns (padded, e.g 64). + int COLS, + // The actual number of columns (unpadded, e.g 40) + int VALID_COLS_, + // Do we use LDGSTS? + bool USE_LDGSTS_, + // Are attention heads interleaved? + bool HEADS_INTERLEAVED, + // The number of matrices + int NUM_MATS = 3, + // Is sliding window attention used ? + bool SLIDING_WINDOW_ATTENTION = false> +struct Gmem_tile_qkv { + // The size of each LDG. + enum { BYTES_PER_LDG = 16 }; + + // The number of bits/bytes of element + enum { BITS_PER_ELEMENT = BITS_PER_ELEMENT_ }; + + enum { BYTES_PER_ELEMENT = BITS_PER_ELEMENT_ / 8 }; + + // The size of a row in bytes. + enum { BYTES_PER_ROW = COLS * BITS_PER_ELEMENT / 8 }; + + // The number of threads to load a "row" of the matrix. + enum { THREADS_PER_ROW = BYTES_PER_ROW / BYTES_PER_LDG }; + + // The valid size of a row in bytes (without paddings). + enum { VALID_COLS = VALID_COLS_ }; + + // The amount of bytes that are valid per row. 
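Ldgsts_helper above is a tag-dispatch shim: specializing on the USE_LDGSTS flag picks the copy mechanism at compile time while the tile's load() keeps a single body. A simplified, host-side analogue of the same pattern (the Copy_helper name and the float payload are inventions for illustration only):

```cpp
#include <cstdio>

// The bool parameter selects the code path at compile time, mirroring how the primary
// Ldgsts_helper template handles the LDGSTS path and the <0> specialization the LDG path.
template <bool USE_LDGSTS>
struct Copy_helper {
  static void load(float* dst, const float* src, int n) {
    // Stand-in for the cp.async (LDGSTS) path: copy lands directly in shared memory.
    std::printf("async path, %d elements\n", n);
    for (int i = 0; i < n; ++i) dst[i] = src[i];
  }
};

template <>
struct Copy_helper<false> {
  static void load(float* dst, const float* src, int n) {
    // Stand-in for the LDG path: data is staged through registers first.
    std::printf("register path, %d elements\n", n);
    for (int i = 0; i < n; ++i) dst[i] = src[i];
  }
};

int main() {
  float src[4] = {1, 2, 3, 4}, dst[4];
  Copy_helper<true>::load(dst, src, 4);   // resolved at compile time, no runtime branch
  Copy_helper<false>::load(dst, src, 4);
  return 0;
}
```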
+ enum { VALID_BYTES_PER_ROW = VALID_COLS * BITS_PER_ELEMENT / 8 }; + + // The number of "rows" loaded per LDG. + enum { ROWS_PER_LDG = Cta_tile::THREADS_PER_CTA / THREADS_PER_ROW }; + + // The number of rows. + enum { ROWS = ROWS_ }; + + // The number of LDGs needed to load a chunk of the Q matrix. + enum { LDGS = fmha::Div_up::VALUE }; + + // The number of predicate registers. + enum { PRED_REGS = fmha::Compute_number_of_pred_regs::VALUE }; + + // Is it Hopper? + enum { + IS_HOPPER = std::is_same::value == true + }; + + // Make sure we use a single register to store predicates. Do not throw for Hopper for now. + static_assert(!USE_LDGSTS_ || PRED_REGS == 1 || IS_HOPPER, ""); + + // We do not use LDGSTS (for the moment). + enum { USE_LDGSTS = USE_LDGSTS_ }; + + // Ctor for bert::Fused_multihead_attention_params_v2 class + template + inline __device__ Gmem_tile_qkv(bert::Fused_multihead_attention_params_v2 const& params, + int qkv_offset, Block_info const& binfo, int tidx, + int cta_row_offset = 0, int cta_col_offset_in_bytes = 0) + : Gmem_tile_qkv(params.qkv_ptr, params.q_stride_in_bytes, params.d, params.dv, params.h, + qkv_offset, binfo, tidx, params.h_kv, cta_row_offset, + cta_col_offset_in_bytes) {} + + // Ctor for other param classes (such as Qkv_params in train_ops) + template + inline __device__ Gmem_tile_qkv(Params const& params, int qkv_offset, Block_info const& binfo, + int tidx, int cta_row_offset = 0, int cta_col_offset_in_bytes = 0) + : Gmem_tile_qkv(params.qkv_ptr, params.q_stride_in_bytes, params.d, params.dv, params.h, + qkv_offset, binfo, tidx, cta_row_offset, cta_col_offset_in_bytes) {} + + // Ctor. + template + inline __device__ Gmem_tile_qkv(void* qkv_ptr, size_t qkv_stride_in_bytes, int d, int dv, + int num_heads, int qkv_offset, Block_info const& binfo, int tidx, + int num_kv_heads = 0, int cta_row_offset = 0, + int cta_col_offset_in_bytes = 0) + : params_qkv_stride_in_bytes_(qkv_stride_in_bytes), + actual_seqlen_(binfo.actual_seqlen), + qkv_ptr_(reinterpret_cast(qkv_ptr)) { + // Compute the position in the sequence (within the CTA for the moment). + int row = tidx / THREADS_PER_ROW; + // Compute the position of the thread in the row. + int col = tidx % THREADS_PER_ROW; + + // We must store the value to update the predicates in "load". + row_ = row; + // Do not load/store if the thread is in the padded area + col_in_bytes_ = cta_col_offset_in_bytes + col * BYTES_PER_LDG; + + // The row offset in the batched GEMM. For each seq element, we store QKV in that order. + int64_t row_offset = (int64_t)(row + cta_row_offset) * params_qkv_stride_in_bytes_; + // Add the byte index. + int64_t idx; + + // Both MQA and GQA will use non HEADS_INTERLEAVED layout + if (num_kv_heads < num_heads) { + int const head_id = binfo.bidh; + int const kv_head_id = binfo.bidh / (num_heads / num_kv_heads); + // QKV layout [b, s, [q_hd, k_h'd, v_h'd]] + idx = binfo.sum_s * params_qkv_stride_in_bytes_; + if (qkv_offset == 0) { // Q tensor + idx += head_id * VALID_BYTES_PER_ROW; + } else if (qkv_offset == 1) { // K tensor + idx += (num_heads + kv_head_id) * VALID_BYTES_PER_ROW; + } else if (qkv_offset == 2) { // V tensor + /* When qkv_offset == 2, this is an instance of Gmem_tile_v defined in Kernel_traits: + using Gmem_tile_v = Gmem_tile_v_; + the 6th template argument is VALID_DV instead of VALID_D. 
+ Thus, here VALID_COLS equals VALID_DV, and + VALID_BYTES_PER_ROW equals VALID_DV * BYTES_PER_ELEMENT, + and `kv_head_id * dv * BYTES_PER_ELEMENT` can be optimized to + `kv_head_id * VALID_BYTES_PER_ROW`. */ + idx += + (num_heads + num_kv_heads) * d * BYTES_PER_ELEMENT + kv_head_id * VALID_BYTES_PER_ROW; + } + } else if (HEADS_INTERLEAVED) { + // [b, s, h, [q_d, k_d, v_d]] aka bsh3d + // bidx = sum_s * params.h + bidh; + idx = (binfo.bidx * (2 * d + dv) + qkv_offset * d) * BYTES_PER_ELEMENT; + } else { + // [b, s, [q_hd, k_hd, v_hd]] aka bs3hd + idx = binfo.sum_s * params_qkv_stride_in_bytes_ + + qkv_offset * num_heads * d * BYTES_PER_ELEMENT + binfo.bidh * VALID_BYTES_PER_ROW; + } + + // Assemble the final pointer. + qkv_ptr_ += row_offset + idx + col_in_bytes_; + + // Take the CTA offset to modify the sequence length. + actual_seqlen_ -= cta_row_offset; + + // Set the initial seq_len and qkv_offset in case of reinterating + actual_seqlen_init_ = actual_seqlen_; + qkv_ptr_init_ = qkv_ptr_; + } + + // Store data to shared memory. + template + inline __device__ void commit(Smem_tile& smem_tile) { + if (!USE_LDGSTS) { + smem_tile.store(fetch_); + } + } + + // Load data from memory. + template + inline __device__ void load(Smem_tile& smem_tile) { + uint32_t preds[LDGS]; +#pragma unroll + for (int ii = 0; ii < LDGS; ++ii) { + preds[ii] = row_ + ii * (int)ROWS_PER_LDG < min((int)ROWS, actual_seqlen_); + preds[ii] &= col_in_bytes_ < VALID_BYTES_PER_ROW; + } + + // Prepare the load pointers. + void const* ptrs[LDGS]; +#pragma unroll + for (int ii = 0; ii < LDGS; ++ii) { + ptrs[ii] = qkv_ptr_ + (int64_t)ii * ROWS_PER_LDG * params_qkv_stride_in_bytes_; + } + + // Trigger LDGSTS or the LDGs. + // The predicates protect against out-of-bound access in rows and cols + Ldgsts_helper::load(this, smem_tile, ptrs, preds); + } + + // Load data from memory. + inline __device__ void load() { + uint32_t preds[LDGS]; +#pragma unroll + for (int ii = 0; ii < LDGS; ++ii) { + preds[ii] = row_ + ii * (int)ROWS_PER_LDG < min((int)ROWS, actual_seqlen_); + } + + // Prepare the load pointers. + void const* ptrs[LDGS]; +#pragma unroll + for (int ii = 0; ii < LDGS; ++ii) { + ptrs[ii] = qkv_ptr_ + (int64_t)ii * ROWS_PER_LDG * params_qkv_stride_in_bytes_; + } + + // Trigger the LDGs. + if (col_in_bytes_ < VALID_BYTES_PER_ROW) { + fmha::pack_predicates(preds_, preds); + fmha::ldg(fetch_, ptrs, preds_); + } else { +#pragma unroll + for (int ii = 0; ii < LDGS; ++ii) { + fetch_[ii] = make_uint4(0u, 0u, 0u, 0u); + } + } + } + + // Move the pointer to the next row location. + inline __device__ void move(int const steps = 1) { + qkv_ptr_ += (int64_t)ROWS * params_qkv_stride_in_bytes_ * steps; + actual_seqlen_ -= (int)ROWS * steps; + } + + // Move the pointer to the next row location by the offset (not step). 
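For the MQA/GQA branch of the packed-QKV constructor, each token row is laid out as [q_hd, k_h'd, v_h'd] and the per-head byte offsets are assembled exactly as in the code above. A host-side sketch with an assumed configuration (8 query heads, 2 KV heads, d = dv = 128, fp16), not taken from any shipped config:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  // Hypothetical GQA configuration, for illustration only.
  const int num_heads = 8;       // query heads (h)
  const int num_kv_heads = 2;    // key/value heads (h')
  const int d = 128;             // head size of Q and K
  const int dv = 128;            // head size of V (can differ, e.g. for MLA)
  const int bytes_per_elt = 2;   // fp16

  const int bidh = 5;                                      // query head handled by this CTA
  const int kv_head = bidh / (num_heads / num_kv_heads);   // 5 / 4 = 1

  // Byte offsets inside one packed token row [q_hd, k_h'd, v_h'd], mirroring the ctor.
  int64_t q_off = (int64_t)bidh * d * bytes_per_elt;
  int64_t k_off = (int64_t)(num_heads + kv_head) * d * bytes_per_elt;
  int64_t v_off = (int64_t)(num_heads + num_kv_heads) * d * bytes_per_elt +
                  (int64_t)kv_head * dv * bytes_per_elt;

  std::printf("q_off=%lld k_off=%lld v_off=%lld row_bytes=%lld\n", (long long)q_off,
              (long long)k_off, (long long)v_off,
              (long long)((int64_t)(num_heads * d + num_kv_heads * (d + dv)) * bytes_per_elt));
  return 0;
}
```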
+ inline __device__ void move_by_offset(int const offset) { + qkv_ptr_ = qkv_ptr_init_ + (int64_t)offset * params_qkv_stride_in_bytes_; + actual_seqlen_ = actual_seqlen_init_ - (int)offset; + } + + // Move the pointer to the next column location + inline __device__ void move_col(int const steps = 1) { + qkv_ptr_ += (int64_t)COLS * (BITS_PER_ELEMENT / 8) * steps; + // Update col_in_bytes_ to ensure load predicates work + col_in_bytes_ += THREADS_PER_ROW * BYTES_PER_LDG * steps; + } + + inline __device__ void reset() { + qkv_ptr_ = qkv_ptr_init_; + actual_seqlen_ = actual_seqlen_init_; + } + + // Rewind the pointer back to previous column location + inline __device__ void rewind_col(int const steps) { + qkv_ptr_ -= COLS * (BITS_PER_ELEMENT / 8) * steps; + // Update col_in_bytes_ to ensure load predicates work + col_in_bytes_ -= THREADS_PER_ROW * BYTES_PER_LDG * steps; + } + + inline __device__ void move_to(int const step) { + qkv_ptr_ = qkv_ptr_init_ + (int64_t)ROWS * params_qkv_stride_in_bytes_ * step; + actual_seqlen_ = actual_seqlen_init_ - (int)ROWS * step; + } + + // Store data to memory. + inline __device__ void store(uint4 const (&data)[LDGS]) { +#pragma unroll + for (int ii = 0; ii < LDGS; ++ii) { + char* ptr = qkv_ptr_ + (int64_t)ii * ROWS_PER_LDG * params_qkv_stride_in_bytes_; + if (((row_ + ii * ROWS_PER_LDG) < min(ROWS, actual_seqlen_)) && + col_in_bytes_ < VALID_BYTES_PER_ROW /*TODO: double check*/) { + fmha::stg(ptr, data[ii]); + } + } + } + + // The stride between rows for the QKV matrice. + int64_t params_qkv_stride_in_bytes_; + // The pointer. + char* qkv_ptr_; + char* qkv_ptr_init_; + // The register to store predicates. + uint32_t preds_[PRED_REGS]; + // The fetch registers. + uint4 fetch_[LDGS]; + // Keep track of the row and col the thread is processing as we move the tile. + int row_; + int col_in_bytes_; + // The sequence length. + int actual_seqlen_; + int actual_seqlen_init_; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// We expect the Q/K/V layout to be [B, S, H, D] with variable sequence length support. +template < + // The instruction traits. + typename Traits, + // The dimensions of the tile computed by the CTA. + typename Cta_tile, + // The number of bits per element. + int BITS_PER_ELEMENT_, + // The number of rows of Q, K or V loaded by this tile. + int ROWS_, + // The number of columns (padded, e.g 64). + int COLS, + // The actual number of columns (unpadded, e.g 40) + int VALID_COLS_, + // Do we use LDGSTS? + bool USE_LDGSTS_, + // Are attention heads interleaved? (not used) + bool HEADS_INTERLEAVED = false, + // The number of matrices (not used) + int NUM_MATS = 1, + // Is sliding window attention used ? + bool SLIDING_WINDOW_ATTENTION = false> +struct Gmem_tile_q_k_v { + // The size of each LDG. + enum { BYTES_PER_LDG = 16 }; + + // The number of bits/bytes of element + enum { BITS_PER_ELEMENT = BITS_PER_ELEMENT_ }; + + enum { BYTES_PER_ELEMENT = BITS_PER_ELEMENT_ / 8 }; + + // The size of a row in bytes. + enum { BYTES_PER_ROW = COLS * BITS_PER_ELEMENT / 8 }; + + // The number of threads to load a "row" of the matrix. + enum { THREADS_PER_ROW = BYTES_PER_ROW / BYTES_PER_LDG }; + + // The valid size of a row in bytes (without paddings). + enum { VALID_COLS = VALID_COLS_ }; + + // The amount of bytes that are valid per row. 
+ enum { VALID_BYTES_PER_ROW = VALID_COLS * BITS_PER_ELEMENT / 8 }; + + // The number of "rows" loaded per LDG. + enum { ROWS_PER_LDG = Cta_tile::THREADS_PER_CTA / THREADS_PER_ROW }; + + // The number of rows. + enum { ROWS = ROWS_ }; + + // The number of LDGs needed to load a chunk of the Q matrix. + enum { LDGS = fmha::Div_up::VALUE }; + + // The number of predicate registers. + enum { PRED_REGS = fmha::Compute_number_of_pred_regs::VALUE }; + + // Is it Hopper? + enum { + IS_HOPPER = std::is_same::value == true + }; + + // Make sure we use a single register to store predicates. Do not throw for Hopper for now. + static_assert(!USE_LDGSTS_ || PRED_REGS == 1 || IS_HOPPER, ""); + + // We do not use LDGSTS (for the moment). + enum { USE_LDGSTS = USE_LDGSTS_ }; + + // Ctor + // qkv_offset: 0 for Q, 1 for K, 2 for V + template + inline __device__ Gmem_tile_q_k_v(bert::Fused_multihead_attention_params_v2 const& params, + int qkv_offset, Block_info const& binfo, int tidx, + int cta_row_offset = 0, int cta_col_offset_in_bytes = 0) { + int seq_offset = 0; + if (qkv_offset == 0) { + // Q tensor + params_q_k_v_stride_in_bytes_ = params.q_stride_in_bytes; + q_k_v_ptr_ = reinterpret_cast(params.q_ptr); + actual_seqlen_ = binfo.actual_q_seqlen; + seq_offset = binfo.sum_s; + } else if (qkv_offset == 1) { + // K tensor + params_q_k_v_stride_in_bytes_ = params.k_stride_in_bytes; + q_k_v_ptr_ = reinterpret_cast(params.k_ptr); + actual_seqlen_ = binfo.actual_kv_seqlen; + seq_offset = binfo.sum_s_kv; + } else if (qkv_offset == 2) { + // V tensor + params_q_k_v_stride_in_bytes_ = params.v_stride_in_bytes; + q_k_v_ptr_ = reinterpret_cast(params.v_ptr); + actual_seqlen_ = binfo.actual_kv_seqlen; + seq_offset = binfo.sum_s_kv; + } + + // Compute the position in the sequence (within the CTA for the moment). + int row = tidx / THREADS_PER_ROW; + // Compute the position of the thread in the row. + int col = tidx % THREADS_PER_ROW; + + // We must store the value to update the predicates in "load". + row_ = row; + // Do not load/store if the thread is in the padded area + col_in_bytes_ = cta_col_offset_in_bytes + col * BYTES_PER_LDG; + + // The row offset in the batched GEMM, including the sequence offset. + int64_t row_offset = + (int64_t)(row + cta_row_offset + seq_offset) * params_q_k_v_stride_in_bytes_; + // Add the head index. + int64_t idx = binfo.bidh; + + // Assemble the final pointer. + q_k_v_ptr_ += row_offset + idx * VALID_BYTES_PER_ROW + col_in_bytes_; + + // Take the CTA offset to modify the sequence length. + actual_seqlen_ -= cta_row_offset; + + // Set the initial seq_len and qkv_offset in case of reinterating + actual_seqlen_init_ = actual_seqlen_; + q_k_v_ptr_init_ = q_k_v_ptr_; + } + + // Store data to shared memory. + template + inline __device__ void commit(Smem_tile& smem_tile) { + if (!USE_LDGSTS) { + smem_tile.store(fetch_); + } + } + + // Load data from memory. + template + inline __device__ void load(Smem_tile& smem_tile) { + uint32_t preds[LDGS]; +#pragma unroll + for (int ii = 0; ii < LDGS; ++ii) { + preds[ii] = row_ + ii * (int)ROWS_PER_LDG < min((int)ROWS, actual_seqlen_); + preds[ii] &= col_in_bytes_ < VALID_BYTES_PER_ROW; + } + + // Prepare the load pointers. + void const* ptrs[LDGS]; +#pragma unroll + for (int ii = 0; ii < LDGS; ++ii) { + ptrs[ii] = q_k_v_ptr_ + (int64_t)ii * ROWS_PER_LDG * params_q_k_v_stride_in_bytes_; + } + + // Trigger LDGSTS or the LDGs. 
+ // The predicates protect against out-of-bound access in rows and cols + Ldgsts_helper::load(this, smem_tile, ptrs, preds); + } + + // Move the pointer to the next row location. + inline __device__ void move(int const steps = 1) { + q_k_v_ptr_ += (int64_t)ROWS * params_q_k_v_stride_in_bytes_ * steps; + actual_seqlen_ -= (int)ROWS * steps; + } + + // Move the pointer to the next row location by the offset (not step). + inline __device__ void move_by_offset(int const offset) { + q_k_v_ptr_ = q_k_v_ptr_init_ + (int64_t)offset * params_q_k_v_stride_in_bytes_; + actual_seqlen_ = actual_seqlen_init_ - (int)offset; + } + + // Move the pointer to the next column location + inline __device__ void move_col() { + q_k_v_ptr_ += (int64_t)COLS * (BITS_PER_ELEMENT / 8); + // Update col_in_bytes_ to ensure load predicates work + col_in_bytes_ += THREADS_PER_ROW * BYTES_PER_LDG; + } + + // Rewind the pointer back to previous column location + inline __device__ void rewind_col(int const steps) { + q_k_v_ptr_ -= COLS * (BITS_PER_ELEMENT / 8) * steps; + // Update col_in_bytes_ to ensure load predicates work + col_in_bytes_ -= THREADS_PER_ROW * BYTES_PER_LDG * steps; + } + + // Move the pointer to the specified step. + inline __device__ void move_to(int const step) { + q_k_v_ptr_ = q_k_v_ptr_init_ + (int64_t)ROWS * params_q_k_v_stride_in_bytes_ * step; + actual_seqlen_ = actual_seqlen_init_ - (int)ROWS * step; + } + + inline __device__ void reset() { + q_k_v_ptr_ = q_k_v_ptr_init_; + actual_seqlen_ = actual_seqlen_init_; + } + + // The stride between rows for the Q/K/V matrice. + int64_t params_q_k_v_stride_in_bytes_; + // The pointer. + char* q_k_v_ptr_; + char* q_k_v_ptr_init_; + // The register to store predicates. + uint32_t preds_[PRED_REGS]; + // The fetch registers. + uint4 fetch_[LDGS]; + // Keep track of the row and col the thread is processing as we move the tile. + int row_; + int64_t col_in_bytes_; + // The sequence length. + int actual_seqlen_; + int actual_seqlen_init_; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// Shape [B, S, 2, H, D] where S can be variable sequence length. +template < + // The instruction traits. + typename Traits, + // The dimensions of the tile computed by the CTA. + typename Cta_tile, + // The number of bits per element. + int BITS_PER_ELEMENT_, + // The number of rows of Q, K or V loaded by this tile. + int ROWS_, + // The number of columns (padded, e.g 64). + int COLS, + // The actual number of columns (unpadded, e.g 40) + int VALID_COLS_, + // Do we use LDGSTS? + bool USE_LDGSTS_, + // Are attention heads interleaved? (Not used) + bool HEADS_INTERLEAVED, + // The number of matrices (Not used) + int NUM_MATS = 2, + // Is sliding window attention used ? + bool SLIDING_WINDOW_ATTENTION = false> +struct Gmem_tile_contiguous_kv { + // The size of each LDG. + enum { BYTES_PER_LDG = 16 }; + + // The number of bits/bytes of element + enum { BITS_PER_ELEMENT = BITS_PER_ELEMENT_ }; + + enum { BYTES_PER_ELEMENT = BITS_PER_ELEMENT_ / 8 }; + + // The size of a row in bytes. + enum { BYTES_PER_ROW = COLS * BITS_PER_ELEMENT / 8 }; + + // The number of threads to load a "row" of the matrix. + enum { THREADS_PER_ROW = BYTES_PER_ROW / BYTES_PER_LDG }; + + // The valid size of a row in bytes (without paddings). + enum { VALID_COLS = VALID_COLS_ }; + + // The amount of bytes that are valid per row. + enum { VALID_BYTES_PER_ROW = VALID_COLS * BITS_PER_ELEMENT / 8 }; + + // The number of "rows" loaded per LDG. 
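Gmem_tile_q_k_v addresses Q, K and V through separate base pointers, each with a contiguous [total_tokens, H, D] layout: the variable-length sequence offset and the row go through the full token stride, the head selects a D-wide slice, and the thread adds its 16-byte column. A sketch of that pointer assembly with illustrative sizes only:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  // Assumed contiguous K tensor: H = 8 heads, D = 64, fp16, variable-length batches.
  const int64_t k_stride_in_bytes = 8 * 64 * 2;  // bytes per token row (all heads)
  const int64_t valid_bytes_per_row = 64 * 2;    // one head's slice of that row
  const int sum_s_kv = 300;                      // token offset of this sequence in the batch
  const int bidh = 3;                            // head index
  const int row = 5, col_in_bytes = 32;          // per-thread coordinates inside the tile

  // Same assembly as the ctor: (row + seq offset) * token stride + head slice + thread column.
  int64_t ptr_offset = (int64_t)(row + sum_s_kv) * k_stride_in_bytes +
                       (int64_t)bidh * valid_bytes_per_row + col_in_bytes;
  std::printf("K byte offset for this thread: %lld\n", (long long)ptr_offset);
  return 0;
}
```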
+ enum { ROWS_PER_LDG = Cta_tile::THREADS_PER_CTA / THREADS_PER_ROW }; + + // The number of rows. + enum { ROWS = ROWS_ }; + + // The number of LDGs needed to load a chunk of the Q matrix. + enum { LDGS = fmha::Div_up::VALUE }; + + // The number of predicate registers. + enum { PRED_REGS = fmha::Compute_number_of_pred_regs::VALUE }; + + // Is it Hopper? + enum { + IS_HOPPER = std::is_same::value == true + }; + + // Make sure we use a single register to store predicates. Do not throw for Hopper for now. + static_assert(!USE_LDGSTS_ || PRED_REGS == 1 || IS_HOPPER, ""); + + // We do not use LDGSTS (for the moment). + enum { USE_LDGSTS = USE_LDGSTS_ }; + + // Ctor for bert::Fused_multihead_attention_params_v2 class + template + inline __device__ Gmem_tile_contiguous_kv(bert::Fused_multihead_attention_params_v2 const& params, + int qkv_offset, // q = 0, k = 1, v = 2. + Block_info const& binfo, int tidx, + int cta_row_offset = 0, int cta_col_offset_in_bytes = 0) + : Gmem_tile_contiguous_kv(params.kv_ptr, params.k_stride_in_bytes, params.h_kv, + params.h_q_per_kv, qkv_offset, binfo, tidx, cta_row_offset, + cta_col_offset_in_bytes) {} + + // Ctor. + template + inline __device__ Gmem_tile_contiguous_kv(void* kv_ptr, size_t kv_stride_in_bytes, + int num_kv_heads, int head_group_size, int qkv_offset, + Block_info const& binfo, int tidx, + int cta_row_offset = 0, int cta_col_offset_in_bytes = 0) + : params_kv_stride_in_bytes_(kv_stride_in_bytes), + actual_seqlen_(binfo.actual_kv_seqlen), + kv_ptr_(reinterpret_cast(kv_ptr)) { + // Compute the position in the sequence (within the CTA for the moment). + int row = tidx / THREADS_PER_ROW; + // Compute the position of the thread in the row. + int col = tidx % THREADS_PER_ROW; + + // We must store the value to update the predicates in "load". + row_ = row; + // Do not load/store if the thread is in the padded area + col_in_bytes_ = cta_col_offset_in_bytes + col * BYTES_PER_LDG; + + // The row offset in the batched GEMM. + int64_t row_offset = (int64_t)(row + cta_row_offset) * params_kv_stride_in_bytes_; + // [b, s, 2, h_kv, d]. + int64_t idx = + (binfo.sum_s_kv * 2 + qkv_offset - 1) * num_kv_heads + (binfo.bidh / head_group_size); + + // Assemble the final pointer. + kv_ptr_ += row_offset + idx * VALID_BYTES_PER_ROW + col_in_bytes_; + + // Take the CTA offset to modify the sequence length. + actual_seqlen_ -= cta_row_offset; + + // Set the initial seq_len and qkv_offset in case of reinterating + actual_seqlen_init_ = actual_seqlen_; + kv_ptr_init_ = kv_ptr_; + } + + // Store data to shared memory. + template + inline __device__ void commit(Smem_tile& smem_tile) { + if (!USE_LDGSTS) { + smem_tile.store(fetch_); + } + } + + // Load data from memory. + template + inline __device__ void load(Smem_tile& smem_tile) { + uint32_t preds[LDGS]; +#pragma unroll + for (int ii = 0; ii < LDGS; ++ii) { + preds[ii] = row_ + ii * (int)ROWS_PER_LDG < min((int)ROWS, actual_seqlen_); + preds[ii] &= col_in_bytes_ < VALID_BYTES_PER_ROW; + } + + // Prepare the load pointers. + void const* ptrs[LDGS]; +#pragma unroll + for (int ii = 0; ii < LDGS; ++ii) { + ptrs[ii] = kv_ptr_ + (int64_t)ii * ROWS_PER_LDG * params_kv_stride_in_bytes_; + } + + // Trigger LDGSTS or the LDGs. + // The predicates protect against out-of-bound access in rows and cols + Ldgsts_helper::load(this, smem_tile, ptrs, preds); + } + + // Load data from memory. 
+ inline __device__ void load() { + uint32_t preds[LDGS]; +#pragma unroll + for (int ii = 0; ii < LDGS; ++ii) { + preds[ii] = row_ + ii * (int)ROWS_PER_LDG < min((int)ROWS, actual_seqlen_); + } + + // Prepare the load pointers. + void const* ptrs[LDGS]; +#pragma unroll + for (int ii = 0; ii < LDGS; ++ii) { + ptrs[ii] = kv_ptr_ + (int64_t)ii * ROWS_PER_LDG * params_kv_stride_in_bytes_; + } + + // Trigger the LDGs. + if (col_in_bytes_ < VALID_BYTES_PER_ROW) { + fmha::pack_predicates(preds_, preds); + fmha::ldg(fetch_, ptrs, preds_); + } else { +#pragma unroll + for (int ii = 0; ii < LDGS; ++ii) { + fetch_[ii] = make_uint4(0u, 0u, 0u, 0u); + } + } + } + + // Move the pointer to the next row location. + inline __device__ void move(int const steps = 1) { + kv_ptr_ += (int64_t)ROWS * params_kv_stride_in_bytes_ * steps; + actual_seqlen_ -= (int)ROWS * steps; + } + + // Move the pointer to the next row location by the offset (not step). + inline __device__ void move_by_offset(int const offset) { + kv_ptr_ = kv_ptr_init_ + (int64_t)offset * params_kv_stride_in_bytes_; + actual_seqlen_ = actual_seqlen_init_ - (int)offset; + } + + // Move the pointer to the next column location + inline __device__ void move_col(int const steps = 1) { + kv_ptr_ += (int64_t)COLS * (BITS_PER_ELEMENT / 8) * steps; + // Update col_in_bytes_ to ensure load predicates work + col_in_bytes_ += THREADS_PER_ROW * BYTES_PER_LDG * steps; + } + + inline __device__ void reset() { + kv_ptr_ = kv_ptr_init_; + actual_seqlen_ = actual_seqlen_init_; + } + + // Rewind the pointer back to previous column location + inline __device__ void rewind_col(int const steps) { + kv_ptr_ -= COLS * (BITS_PER_ELEMENT / 8) * steps; + // Update col_in_bytes_ to ensure load predicates work + col_in_bytes_ -= THREADS_PER_ROW * BYTES_PER_LDG * steps; + } + + inline __device__ void move_to(int const step) { + kv_ptr_ = kv_ptr_init_ + (int64_t)ROWS * params_kv_stride_in_bytes_ * step; + actual_seqlen_ = actual_seqlen_init_ - (int)ROWS * step; + } + + // Store data to memory. + inline __device__ void store(uint4 const (&data)[LDGS]) { +#pragma unroll + for (int ii = 0; ii < LDGS; ++ii) { + char* ptr = kv_ptr_ + (int64_t)ii * ROWS_PER_LDG * params_kv_stride_in_bytes_; + if (((row_ + ii * ROWS_PER_LDG) < min(ROWS, actual_seqlen_)) && + col_in_bytes_ < VALID_BYTES_PER_ROW /*TODO: double check*/) { + fmha::stg(ptr, data[ii]); + } + } + } + + // The stride between rows for the QKV matrice. + int64_t params_kv_stride_in_bytes_; + // The pointer. + char* kv_ptr_; + char* kv_ptr_init_; + // The register to store predicates. + uint32_t preds_[PRED_REGS]; + // The fetch registers. + uint4 fetch_[LDGS]; + // Keep track of the row and col the thread is processing as we move the tile. + int row_; + int col_in_bytes_; + // The sequence length. + int actual_seqlen_; + int actual_seqlen_init_; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// We expect the paged KV layout to be blocks of indices with shape of [B, 2, Blocks_per_Seq], +// and the indice tells the memory distance to the pool ptr in global memory. + +template < + // The instruction traits. + typename Traits, + // The dimensions of the tile computed by the CTA. + typename Cta_tile, + // The number of bits per element. + int BITS_PER_ELEMENT_, + // The number of rows of Q, K or V loaded by this tile. + int ROWS_, + // The number of columns (padded, e.g 64). 
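Gmem_tile_contiguous_kv indexes a [b, s, 2, h_kv, d] cache: tokens are counted twice (one K plane and one V plane, selected by qkv_offset - 1) and the query head is folded down to its KV head by head_group_size. A small sketch of that index computation under assumed sizes (4 KV heads, group size 2, d = 64, fp16):

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  // Illustrative contiguous KV cache laid out as [b, s, 2, h_kv, d].
  const int num_kv_heads = 4;       // h_kv
  const int head_group_size = 2;    // query heads per KV head (GQA group size)
  const int d_bytes = 64 * 2;       // head size in bytes (fp16, d = 64)
  const int sum_s_kv = 300;         // token offset of this sequence in the flattened batch
  const int bidh = 5;               // query head -> KV head 5 / 2 = 2

  for (int qkv_offset = 1; qkv_offset <= 2; ++qkv_offset) {  // 1 = K, 2 = V
    // Same head-row index as the ctor: each token contributes a K row and a V row,
    // then the KV head selects a d-wide slice.
    int64_t idx = ((int64_t)sum_s_kv * 2 + qkv_offset - 1) * num_kv_heads +
                  bidh / head_group_size;
    std::printf("%s head-row index = %lld (byte offset %lld)\n", qkv_offset == 1 ? "K" : "V",
                (long long)idx, (long long)(idx * d_bytes));
  }
  return 0;
}
```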
+ int COLS, + // The actual number of columns (unpadded, e.g 40) + int VALID_COLS_, + // Do we use LDGSTS? + bool USE_LDGSTS_, + // Are attention heads interleaved? (not used) + bool HEADS_INTERLEAVED = false, + // The number of matrices (not used) + int NUM_MATS = 2, + // Is sliding window attention used ? + bool SLIDING_WINDOW_ATTENTION_ = false> +struct Gmem_tile_paged_kv { + // The size of each LDG. + enum { BYTES_PER_LDG = 16 }; + + // The number of bits/bytes of element + enum { BITS_PER_ELEMENT = BITS_PER_ELEMENT_ }; + + enum { BYTES_PER_ELEMENT = BITS_PER_ELEMENT_ / 8 }; + + // The size of a row in bytes. + enum { BYTES_PER_ROW = COLS * BITS_PER_ELEMENT / 8 }; + + // The number of threads to load a "row" of the matrix. + enum { THREADS_PER_ROW = BYTES_PER_ROW / BYTES_PER_LDG }; + + // The valid size of a row in bytes (without paddings). + enum { VALID_COLS = VALID_COLS_ }; + + // The amount of bytes that are valid per row. + enum { VALID_BYTES_PER_ROW = VALID_COLS * BITS_PER_ELEMENT / 8 }; + + // The number of "rows" loaded per LDG. + enum { ROWS_PER_LDG = Cta_tile::THREADS_PER_CTA / THREADS_PER_ROW }; + + // The number of rows. + enum { ROWS = ROWS_ }; + + // The number of LDGs needed to load a chunk of the Q matrix. + enum { LDGS = fmha::Div_up::VALUE }; + + // The number of predicate registers. + enum { PRED_REGS = fmha::Compute_number_of_pred_regs::VALUE }; + + // Is sliding window attention used ? + enum { SLIDING_WINDOW_ATTENTION = SLIDING_WINDOW_ATTENTION_ }; + + // Is it Hopper? + enum { + IS_HOPPER = std::is_same::value == true + }; + + // Make sure we use a single register to store predicates. Do not throw for Hopper for now. + static_assert(!USE_LDGSTS_ || PRED_REGS == 1 || IS_HOPPER, ""); + + // We do not use LDGSTS (for the moment). + enum { USE_LDGSTS = USE_LDGSTS_ }; + + // Ctor. + template + inline __device__ Gmem_tile_paged_kv(bert::Fused_multihead_attention_params_v2 const& params, + int qkv_offset, // q = 0, k = 1, v = 2. + Block_info const& binfo, int tidx, int cta_row_offset = 0, + int cta_col_offset_in_bytes = 0) + : actual_seqlen_(binfo.actual_seqlen), + past_seqlen_(binfo.actual_seqlen - binfo.actual_q_seqlen), + sliding_window_size_(params.sliding_window_size), + paged_kv_log2_block_size_(params.paged_kv_cache.mTokensPerBlockLog2), + paged_kv_block_pool_ptr_(reinterpret_cast(params.paged_kv_cache.mPoolPtr)), + paged_kv_global_block_offsets_(params.paged_kv_cache.mBlockOffsets), + params_kv_block_size_in_bytes_(params.paged_kv_cache.mBytesPerBlock) { + // Handle Paged KV with shape [S, Dh], by offsetting it to the target batch. + int32_t const paged_kv_block_offset = + (binfo.bidb * 2 + qkv_offset - 1) * params.paged_kv_cache.mMaxBlocksPerSeq; + paged_kv_global_block_offsets_ += paged_kv_block_offset; + + // Compute the position in the sequence (within the CTA for the moment). + int row = tidx / THREADS_PER_ROW; + // Compute the position of the thread in the row. + int col = tidx % THREADS_PER_ROW; + + // We must store the value to update the predicates in "load". + row_ = row; + // Do not load/store if the thread is in the padded area + col_in_bytes_ = cta_col_offset_in_bytes + col * BYTES_PER_LDG; + + int64_t kv_stride_in_bytes = + qkv_offset == 1 ? params.k_stride_in_bytes : params.v_stride_in_bytes; + // The head offset. 
+ head_stride_in_bytes_ = (int64_t)(binfo.bidh / params.h_q_per_kv) * kv_stride_in_bytes; + // When V is padded (like MLA), we cannot use VALID_BYTES_PER_ROW + token_stride_in_bytes_ = kv_stride_in_bytes >> paged_kv_log2_block_size_; + + // Take the CTA offset to modify the sequence length. + // Actually we don't need that for flash attention. + actual_seqlen_ -= cta_row_offset; + } + + // Store data to shared memory. + template + inline __device__ void commit(Smem_tile& smem_tile) { + if (!USE_LDGSTS) { + smem_tile.store(fetch_); + } + } + + // Load data from memory. + template + inline __device__ void load(Smem_tile& smem_tile) { + // Prepare the predicates. + uint32_t preds[LDGS]; + // Prepare the load pointers. + void const* ptrs[LDGS]; + + // Offset for the new paged kv pointer. + uint64_t const head_col_in_bytes = head_stride_in_bytes_ + col_in_bytes_; + +// Update paged_kv ptr for each LDG (reuse is possible). +#pragma unroll + for (int ii = 0; ii < LDGS; ++ii) { + int row_idx = row_ + ii * (int)ROWS_PER_LDG; + int paged_kv_block_idx = (row_idx >> paged_kv_log2_block_size_); + char const* local_kv_ptr = reinterpret_cast( + paged_kv_block_pool_ptr_ + + params_kv_block_size_in_bytes_ * paged_kv_global_block_offsets_[paged_kv_block_idx]); + + // Predicates. + // TODO: do we need to make sure row_idx < ROWS ? + preds[ii] = row_idx < actual_seqlen_; + preds[ii] &= col_in_bytes_ < VALID_BYTES_PER_ROW; + + // Pointers. + int row_idx_in_block = row_idx & ((1 << paged_kv_log2_block_size_) - 1); + ptrs[ii] = + local_kv_ptr + head_col_in_bytes + (int64_t)row_idx_in_block * token_stride_in_bytes_; + } + + // Trigger LDGSTS or the LDGs. + // The predicates protect against out-of-bound access in rows and cols + Ldgsts_helper::load(this, smem_tile, ptrs, preds); + } + + // Move the pointer to the next row location. + inline __device__ void move() { row_ += ROWS; } + + // Move the pointer to the next row location by the offset (not step). + inline __device__ void move_by_offset(int const offset) { row_ += offset; } + + // Move the pointer to the next column location + inline __device__ void move_col() { col_in_bytes_ += THREADS_PER_ROW * BYTES_PER_LDG; } + + // Rewind the pointer back to previous column location + inline __device__ void rewind_col(int const steps) { + // Update col_in_bytes_ to ensure load predicates work + col_in_bytes_ -= THREADS_PER_ROW * BYTES_PER_LDG * steps; + } + + // The stride between rows for the KV matrice. + int64_t params_kv_block_size_in_bytes_; + // The paged cache pool pointer. + char* paged_kv_block_pool_ptr_; + // The paged block offsets. + int32_t* paged_kv_global_block_offsets_; + // The paged block size. + int paged_kv_log2_block_size_; + // The register to store predicates. + uint32_t preds_[PRED_REGS]; + // The fetch registers. + uint4 fetch_[LDGS]; + // Keep track of the row and col the thread is processing as we move the tile. + int row_; + int64_t col_in_bytes_; + // Keep track of the head offset. + int64_t head_stride_in_bytes_; + // // for DeepSeek MLA, the stride of V tokens != VALID_BYTES_PER_ROW + int32_t token_stride_in_bytes_; + // The sequence length. + int actual_seqlen_; + // The past sequence length (kv_seqlen - q_seqlen) considering chunked context. + int past_seqlen_; + // The sliding attention window size. + int sliding_window_size_; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + // The instruction traits. 
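The paged-KV load() decomposes the absolute token index using the block size's log2: the high bits pick an entry in the per-sequence block table, the low bits pick the token inside that block, and the final address adds the head/column offset plus the in-block row times the token stride. A host-side sketch of that address math with made-up pool parameters:

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  // Illustrative paged-KV pool: 16 tokens per block (log2 = 4), 64 KiB per block.
  const int log2_tokens_per_block = 4;
  const int64_t bytes_per_block = 64 * 1024;
  // Per-sequence block table (the role of mBlockOffsets for one batch/K-or-V plane).
  std::vector<int32_t> block_offsets = {7, 3, 12, 5};
  const int64_t token_stride_in_bytes = 128;  // bytes between tokens inside a block
  const int64_t head_col_in_bytes = 256;      // head offset plus this thread's column

  const int row_idx = 37;  // absolute token index within the sequence
  // Same decomposition as load(): high bits select the block, low bits the in-block token.
  int block_idx = row_idx >> log2_tokens_per_block;                  // 37 / 16 = 2
  int row_in_block = row_idx & ((1 << log2_tokens_per_block) - 1);   // 37 % 16 = 5
  int64_t addr = bytes_per_block * block_offsets[block_idx] +        // jump into the pool
                 head_col_in_bytes + (int64_t)row_in_block * token_stride_in_bytes;
  std::printf("block=%d row_in_block=%d pool byte offset=%lld\n", block_idx, row_in_block,
              (long long)addr);
  return 0;
}
```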
+ typename Traits, + // The dimensions of the tile computed by the CTA. + typename Cta_tile, + // The number of bits per element. + int BITS_PER_ELEMENT, + // The number of rows of Q loaded by this tile. + int ROWS_, + // The number of columns. + int COLS, + // Do we use LDGSTS? + bool USE_LDGSTS_, + // Are attention heads interleaved? + bool HEADS_INTERLEAVED, + // The number of matrices + int NUM_MATS = 1> +struct Gmem_tile_q_kv { + // The size of each LDG. + enum { BYTES_PER_LDG = 16 }; + + // The padded to the next power of 2 number of columns + enum { COLS_PADDED = Next_power_of_two::VALUE }; + + // The padded size of a row in bytes. + enum { BYTES_PER_ROW_PADDED = COLS_PADDED * BITS_PER_ELEMENT / 8 }; + + // The size of a row in bytes. + enum { BYTES_PER_ROW = COLS * BITS_PER_ELEMENT / 8 }; + + // The number of threads to load a padded "row" of the matrix. + enum { THREADS_PER_ROW_PADDED = BYTES_PER_ROW_PADDED / BYTES_PER_LDG }; + + // The number of threads to load a "row" of the matrix. + enum { THREADS_PER_ROW = BYTES_PER_ROW / BYTES_PER_LDG }; + + // The number of "rows" loaded per LDG. + enum { ROWS_PER_LDG = Cta_tile::THREADS_PER_CTA / THREADS_PER_ROW_PADDED }; + + // The number of rows. + enum { ROWS = ROWS_ }; + + // The number of LDGs needed to load a chunk of the Q matrix. + enum { LDGS = fmha::Div_up::VALUE }; + + // The number of predicate registers. + enum { PRED_REGS = fmha::Compute_number_of_pred_regs::VALUE }; + + // Is it Hopper? + enum { + IS_HOPPER = std::is_same::value == true + }; + + // Make sure we use a single register to store predicates. Do not throw for Hopper for now. + static_assert(!USE_LDGSTS_ || PRED_REGS == 1 || IS_HOPPER, ""); + + // We do not use LDGSTS (for the moment). + enum { USE_LDGSTS = USE_LDGSTS_ }; + + // Ctor. + template + inline __device__ Gmem_tile_q_kv(Params const& params, int offset, Block_info const& binfo, + int tidx, int cta_row_offset = 0) + : params_stride_in_bytes_(params.stride_in_bytes), + actual_seqlen_(binfo.actual_seqlen), + ptr_(reinterpret_cast(params.ptr)) { + // Compute the position in the sequence (within the CTA for the moment). + int row = tidx / THREADS_PER_ROW_PADDED; + // Compute the position of the thread in the row. + int col = tidx % THREADS_PER_ROW_PADDED; + + // We must store the value to update the predicates in "load". + row_ = row; + // Mask for predicate if the channels are in the padded area + int const bytes_per_row_non_padded = params.d * BITS_PER_ELEMENT / 8; + mask_ = col < bytes_per_row_non_padded / BYTES_PER_LDG; + + // The row offset in the batched GEMM. For each seq element, we store QKV in that order. + int64_t row_offset = (int64_t)(row + cta_row_offset) * params.stride_in_bytes; + // Add the block index. + int64_t idx; + if (HEADS_INTERLEAVED) { + idx = binfo.bidx * NUM_MATS + offset; + } else { + idx = (binfo.sum_s * NUM_MATS + offset) * params.h + binfo.bidh; + } + // Assemble the final pointer. + ptr_ += row_offset + idx * bytes_per_row_non_padded + col * BYTES_PER_LDG; + + // Take the CTA offset to modify the sequence length. + actual_seqlen_ -= cta_row_offset; + } + + // Store data to shared memory. + template + inline __device__ void commit(Smem_tile& smem_tile) { + if (!USE_LDGSTS) { + smem_tile.store(fetch_); + } + } + + // Load data from memory. 
+ template + inline __device__ void load(Smem_tile& smem_tile) { + uint32_t preds[LDGS]; +#pragma unroll + for (int ii = 0; ii < LDGS; ++ii) { + preds[ii] = (row_ + ii * (int)ROWS_PER_LDG < min((int)ROWS, actual_seqlen_)) && mask_; + } + + // Prepare the load pointers. + void const* ptrs[LDGS]; +#pragma unroll + for (int ii = 0; ii < LDGS; ++ii) { + ptrs[ii] = ptr_ + (int64_t)ii * ROWS_PER_LDG * params_stride_in_bytes_; + } + + // Trigger LDGSTS or the LDGs. + Ldgsts_helper::load(this, smem_tile, ptrs, preds); + } + + inline __device__ void move(int const steps = 1) { + ptr_ += (int64_t)ROWS * params_stride_in_bytes_ * steps; + actual_seqlen_ -= (int)ROWS * steps; + } + + // Store data to memory. + inline __device__ void store(uint4 const (&data)[LDGS]) { +#pragma unroll + for (int ii = 0; ii < LDGS; ++ii) { + char* ptr = ptr_ + (int64_t)ii * ROWS_PER_LDG * params_stride_in_bytes_; + if ((row_ + ii * ROWS_PER_LDG) < min(ROWS, actual_seqlen_)) { + fmha::stg(ptr, data[ii]); + } + } + } + + // The stride between rows for the matrix. + int64_t params_stride_in_bytes_; + // The pointer. + char* ptr_; + // The register to store predicates. + uint32_t preds_[PRED_REGS]; + // The fetch registers. + uint4 fetch_[LDGS]; + // Keep track of the row and col the thread is processing as we move the tile. + int row_; + // Keep track of predicate state that depends only on the initialization state. + int mask_; + // The sequence length. + int actual_seqlen_; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + // The instruction traits. + typename Traits, + // The dimensions of the tile computed by the CTA. + typename Cta_tile, + // The number of bits per element. + int BITS_PER_ELEMENT, + // The number of rows of Q, K or V loaded by this tile. + int ROWS_, + // The number of columns. + int COLS, + // Do we use LDGSTS? + bool USE_LDGSTS_> +struct Gmem_tile_qkv_interleaved { + // The vectorization width for NC/32HW32. + enum { VEC = 32 }; + + // The size of each LDG. + enum { BYTES_PER_LDG = 16 }; + + // The size of a row in bytes. + enum { BYTES_PER_ROW = VEC * BITS_PER_ELEMENT / 8 }; + + // DEBUG. + static_assert(BYTES_PER_ROW == 32, ""); + + // END OF DEBUG. + + // The number of threads to load a "row" of the matrix. + enum { THREADS_PER_ROW = BYTES_PER_ROW / BYTES_PER_LDG }; + + // DEBUG. + static_assert(THREADS_PER_ROW == 2, ""); + + // END OF DEBUG. + + // The number of "rows" loaded per LDG. + enum { ROWS_PER_LDG = Cta_tile::THREADS_PER_CTA / THREADS_PER_ROW }; + + // The number of slices. It is either 1 for DIM_PER_HEAD == 32 and 2 for DIM_PER_HEAD == 64. + enum { NUM_SLICES = COLS / VEC }; + + // DEBUG. + static_assert(NUM_SLICES == 1 || NUM_SLICES == 2, ""); + + // END OF DEBUG. + + // The number of rows in a slice. + enum { ROWS = ROWS_ }; + + // The number of LDGs needed to load a chunk of the Q matrix. + enum { LDGS = fmha::Div_up::VALUE }; + + // The number of predicate registers. + enum { PRED_REGS = fmha::Compute_number_of_pred_regs::VALUE }; + + // Make sure we use a single register to store predicates. + static_assert(PRED_REGS == 1, ""); + + // Do we use LDGSTS on Ampere? + enum { USE_LDGSTS = USE_LDGSTS_ }; + + // Ctor. 
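Gmem_tile_q_kv pads the head size to the next power of two for addressing and then masks off the threads whose 16-byte slot starts past the real row length (the mask_ predicate in the constructor). A sketch with an assumed non-power-of-two head size of d = 40:

```cpp
#include <cstdio>

// Next power of two, the same idea as the Next_power_of_two helper used above.
static int next_pow2(int n) {
  int p = 1;
  while (p < n) p *= 2;
  return p;
}

int main() {
  const int d = 40;                 // assumed head size that is not a power of two
  const int bits_per_element = 16;  // fp16
  const int bytes_per_ldg = 16;

  const int cols_padded = next_pow2(d);                                     // 64
  const int bytes_per_row_padded = cols_padded * bits_per_element / 8;      // 128
  const int threads_per_row_padded = bytes_per_row_padded / bytes_per_ldg;  // 8
  const int bytes_per_row_valid = d * bits_per_element / 8;                 // 80

  // Threads whose 16B slot starts past the real row length are masked, like mask_ above.
  for (int col = 0; col < threads_per_row_padded; ++col) {
    bool active = col < bytes_per_row_valid / bytes_per_ldg;  // 80 / 16 = 5 active slots
    std::printf("col %d: %s\n", col, active ? "loads" : "masked");
  }
  return 0;
}
```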
+ template + inline __device__ Gmem_tile_qkv_interleaved(Params const& params, int qkv_select, + Block_info const& block_info, int tidx, + int cta_row_offset = 0) + : actual_seqlen_(block_info.actual_seqlen - cta_row_offset), + total_(params.q_stride_in_bytes), + kv_ptr_(reinterpret_cast(params.qkv_ptr)) { + int bidh = block_info.bidh; + int sum_s = block_info.sum_s; + + // We must keep track of the row to repack predicates in load. + row_ = tidx / THREADS_PER_ROW; + // The column. + int col = tidx % THREADS_PER_ROW; + + // h is N + // d is H + // we get the data in as: 3 x h x (d/32) x total x 32 (think 3 x h x (d/32) + // x b x s x 32) + + // Loading qkv: ignore slice for now. + int qkv_offset = qkv_select * params.h * NUM_SLICES * total_; + // bidh * GROUPS * B * S + b * S. + int block_offset = bidh * NUM_SLICES * total_ + sum_s; + // The row offset. + int row_offset = (qkv_offset + block_offset + cta_row_offset) * BYTES_PER_ROW; + + // That's the pointer to load from (see "load"). + kv_ptr_ += row_offset + col * BYTES_PER_LDG; + + init_actual_seqlen_ = actual_seqlen_; + init_kv_ptr_ = kv_ptr_; + } + + // Store data to shared memory. + template + inline __device__ void commit(Smem_tile& smem_tile) { + if (!USE_LDGSTS) { + smem_tile.store(fetch_); + } + } + + // Load data from memory. + template + inline __device__ void load(Smem_tile& smem_tile) { + void const* ptrs[LDGS]; + uint32_t preds[LDGS]; + +// We precompute slice offsets and predicates +#pragma unroll + for (int ii = 0; ii < LDGS; ii++) { + // the next row + int row_i = row_ + ii * ROWS_PER_LDG; + + // Decompose the current row in slice and original row + int slice = row_i / ROWS; + // The position in the slice. + int row_in_slice = row_i % ROWS; + + // Update the predicate. + preds[ii] = row_in_slice < min(actual_seqlen_, ROWS); + // Compute the pointer. + ptrs[ii] = &kv_ptr_[(slice * total_ + row_in_slice) * BYTES_PER_ROW]; + } + + // Update the predicate register. + fmha::pack_predicates(preds_, preds); + + // Trigger the loads. + if (USE_LDGSTS) { + smem_tile.store(ptrs, preds_); + } else { + fmha::ldg(fetch_, ptrs, preds_); + } + } + + // Move the pointer to the next location. + inline __device__ void move(int const steps = 1) { + kv_ptr_ += (int64_t)ROWS * BYTES_PER_ROW * steps; + actual_seqlen_ -= ROWS * steps; + } + + // Reset to the initial location. + inline __device__ void reset() { + kv_ptr_ = init_kv_ptr_; + actual_seqlen_ = init_actual_seqlen_; + } + + // The pointer. + char const* kv_ptr_; + char const* init_kv_ptr_; + // The register to store predicates. + uint32_t preds_[PRED_REGS]; + // The fetch registers. + uint4 fetch_[LDGS]; + // keep track of the row the thread is processing as we move the tile + int row_; + // The sequence length. + int actual_seqlen_; + int init_actual_seqlen_; + // The number of rows per slice?? + int total_; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace v2 +} // namespace fmha diff --git a/csrc/fmha_v2/fmha/hopper/arrive_wait.h b/csrc/fmha_v2/fmha/hopper/arrive_wait.h new file mode 100644 index 0000000000..6448d82607 --- /dev/null +++ b/csrc/fmha_v2/fmha/hopper/arrive_wait.h @@ -0,0 +1,396 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2011-2024 NVIDIA CORPORATION & AFFILIATES. All rights + * reserved. 
SPDX-License-Identifier: NVIDIA TensorRT Source Code License Agreement + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. + */ + +#pragma once + +// CP ASYNC FEATURES /////////////////////////////////////////////////////////////////////////////// +#if !defined(CUDA_CP_ASYNC_SUPPORTED) && \ + ((__CUDACC_VER_MAJOR__ >= 11) || \ + ((__CUDACC_VER_MAJOR__ == 10) && (__CUDACC_VER_MINOR__ >= 2))) +#define CUDA_CP_ASYNC_SUPPORTED 1 +#endif + +#if !defined(CUDA_CP_ASYNC_ENABLED) && (CUDA_CP_ASYNC_SUPPORTED) +#define CUDA_CP_ASYNC_ENABLED 1 +#endif + +#if CUDA_CP_ASYNC_ENABLED && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) +#define CUDA_CP_ASYNC_ACTIVATED 1 +#endif + +#if !defined(CUDA_CP_ASYNC_GROUP_POLICY_SUPPORTED) && (CUDA_CP_ASYNC_SUPPORTED) && \ + (__CUDACC_VER_MAJOR__ >= 11) +#define CUDA_CP_ASYNC_GROUP_POLICY_SUPPORTED 1 +#endif + +#if !defined(CUDA_CP_ASYNC_GROUP_POLICY_ENABLED) && (CUDA_CP_ASYNC_GROUP_POLICY_SUPPORTED) +#define CUDA_CP_ASYNC_GROUP_POLICY_ENABLED 1 +#endif + +#if CUDA_CP_ASYNC_GROUP_POLICY_ENABLED && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) +#define CUDA_CP_ASYNC_GROUP_POLICY_ACTIVATED 1 +#endif + +#if !defined(CUDA_CP_ASYNC_MBARRIER_ARRIVE_SUPPORTED) && (CUDA_CP_ASYNC_SUPPORTED) && \ + (__CUDACC_VER_MAJOR__ >= 11) +#define CUDA_CP_ASYNC_MBARRIER_ARRIVE_SUPPORTED 1 +#endif + +#if !defined(CUDA_CP_ASYNC_MBARRIER_ARRIVE_ENABLED) && (CUDA_CP_ASYNC_MBARRIER_ARRIVE_SUPPORTED) +#define CUDA_CP_ASYNC_MBARRIER_ARRIVE_ENABLED 1 +#endif + +#if (CUDA_CP_ASYNC_MBARRIER_ARRIVE_ENABLED) && (__CUDA_ARCH__ >= 800) +#define CUDA_CP_ASYNC_MBARRIER_ARRIVE_ACTIVATED 1 +#endif + +#if (CUDA_CP_ASYNC_MBARRIER_ARRIVE_ACTIVATED) && (CUDACC_VERSION >= 111) +#define CUDA_CP_ASYNC_MBARRIER_WAIT_ACTIVATED 1 +#endif + +#if !defined(FMHA_PTX_MBARRIER_TRYWAIT_NOSLEEP_INTERNAL_SUPPORT_ENABLED) +#define FMHA_PTX_MBARRIER_TRYWAIT_NOSLEEP_INTERNAL_SUPPORT_ENABLED 0 +#endif + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace fmha { + +//////////////////////////////////////////////////////////////////////////////////////////////////// +inline __device__ void named_barrier_arrive(uint32_t BARRIER_ID, uint32_t NUM_THREADS) { + if (NUM_THREADS > 1) { + asm volatile("bar.arrive %0, %1;" : : "r"(BARRIER_ID), "r"(NUM_THREADS)); + } +} + +inline __device__ void named_barrier_wait(uint32_t BARRIER_ID, uint32_t NUM_THREADS) { + if (NUM_THREADS > 1) { + asm volatile("bar.sync %0, %1;" ::"r"(BARRIER_ID), "r"(NUM_THREADS)); + } +} + +// it is executed per thread, i.e., each thread can call and init a barrier. +// need a bar.sync after using it. 
+inline __device__ void bar_create(void* bar_ptr, int init_count) { + unsigned smem_ptr = __nvvm_get_smem_pointer(bar_ptr); + + asm volatile( + "{\n\t" +#if CUDA_CP_ASYNC_MBARRIER_ARRIVE_ACTIVATED + "mbarrier.init.shared.b64 [%1], %0; \n\t" +#else + ".reg .s32 negCnt, count, expectedCount;\n\t" + ".reg .s64 comboCnt; \n\t" + "neg.s32 negCnt, %0;\n\t " + "and.b32 count, negCnt, 0x7fffffff; \n\t" + "and.b32 expectedCount, negCnt, 0x3fffffff; \n\t" + "mov.b64 comboCnt, {expectedCount, count}; \n\t" + "st.shared.s64 [%1], comboCnt; \n\t" +#endif + "}" + : + : "r"(init_count), "r"(smem_ptr)); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct Arrive_wait { + public: + inline __device__ Arrive_wait() { bar_base_ = NULL; } + + inline __device__ Arrive_wait(uint64_t* bar_base, int id = 0) { + bar_base_ = bar_base; + id_ = id; + } + + inline __device__ uint64_t* get_bar_addr(int32_t id) { + return reinterpret_cast(bar_base_ + id); + } + + inline __device__ int bar_peek(int id, unsigned int bar_phase) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 + uint64_t* bar_ptr = reinterpret_cast(bar_base_ + id); + unsigned smem_ptr = __nvvm_get_smem_pointer(bar_ptr); + uint32_t result32; +#if FMHA_PTX_MBARRIER_TRYWAIT_NOSLEEP_INTERNAL_SUPPORT_ENABLED + asm volatile( + "{\n\t" + ".reg .pred P3; \n\t" + "mbarrier.try_wait.parity.nosleep.shared.b64 P3, [%1], %2; \n\t" + "selp.b32 %0, 1, 0, P3; \n\t" + "}" + : "=r"(result32) + : "r"(smem_ptr), "r"(bar_phase)); +#else + // public ptx default heruistic generate SASS equal to with .nosleep in internal ptx + asm volatile( + "{\n\t" + ".reg .pred P3; \n\t" + "mbarrier.try_wait.parity.shared.b64 P3, [%1], %2; \n\t" + "selp.b32 %0, 1, 0, P3; \n\t" + "}" + : "=r"(result32) + : "r"(smem_ptr), "r"(bar_phase)); +#endif + return result32; +#else + uint64_t* bar_ptr = reinterpret_cast(bar_base_ + id); + unsigned int output_phase = (bar_ptr[0] >> 63) & 1; + + return output_phase != bar_phase; +#endif + } + + inline __device__ int bar_peek(int id, unsigned int bar_phase, int pred) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 + uint64_t* bar_ptr = reinterpret_cast(bar_base_ + id); + unsigned smem_ptr = __nvvm_get_smem_pointer(bar_ptr); + uint32_t result32; +#if FMHA_PTX_MBARRIER_TRYWAIT_NOSLEEP_INTERNAL_SUPPORT_ENABLED + asm volatile( + "{\n\t" + ".reg .pred P3; \n\t" + ".reg .pred P2;\n\t" + "setp.eq.u32 P2, %3, 1;\n\t" + "@P2 mbarrier.try_wait.parity.nosleep.shared.b64 P3, [%1], %2; \n\t" + "selp.b32 %0, 1, 0, P3; \n\t" + "}" + : "=r"(result32) + : "r"(smem_ptr), "r"(bar_phase), "r"(pred)); +#else + // public ptx default heruistic generate SASS equal to with .nosleep in internal ptx + asm volatile( + "{\n\t" + ".reg .pred P3; \n\t" + ".reg .pred P2;\n\t" + "setp.eq.u32 P2, %3, 1;\n\t" + "@P2 mbarrier.try_wait.parity.shared.b64 P3, [%1], %2; \n\t" + "selp.b32 %0, 1, 0, P3; \n\t" + "}" + : "=r"(result32) + : "r"(smem_ptr), "r"(bar_phase), "r"(pred)); +#endif + return result32; +#else + uint64_t* bar_ptr = reinterpret_cast(bar_base_ + id); + unsigned int output_phase = (bar_ptr[0] >> 63) & 1; + + return output_phase != bar_phase; +#endif + } + + inline __device__ void bar_wait(int id, unsigned int bar_phase) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 + uint64_t* bar_ptr = reinterpret_cast(bar_base_ + id); + unsigned smem_ptr = __nvvm_get_smem_pointer(bar_ptr); + uint32_t large_val = 0x989680; + asm volatile( + "{\n\t" + ".reg .pred P3; \n\t" + "LAB_WAIT: \n\t" + 
//"mbarrier.try_wait.parity.b64 P3, [%0], %1; \n\t" + "mbarrier.try_wait.parity.shared.b64 P3, [%0], %1, %2; \n\t" + "@P3 bra.uni DONE; \n\t" + "bra.uni LAB_WAIT; \n\t" + "DONE: \n\t" + "}" + : + : "r"(smem_ptr), "r"(bar_phase), "r"(large_val)); +#else + uint64_t* bar_ptr = reinterpret_cast(bar_base_ + id); + unsigned smem_ptr = __nvvm_get_smem_pointer(bar_ptr); + + asm volatile( + "{\n\t" + ".reg .pred P3; \n\t" +#ifdef CUDA_CP_ASYNC_MBARRIER_WAIT_ACTIVATED + "mbarrier.test_wait.parity.shared.b64 P3, [%0], %1;\n\t" +#else + ".reg .s32 high, low; \n\t" + ".reg .u32 currentPhase; \n\t" + "ld.volatile.shared.v2.s32 { low, high }, [%0]; \n\t" + "shr.u32 currentPhase, high, 31; \n\t" + "setp.ne.u32 P3, currentPhase, %1; \n\t" +#endif + "@P3 bra.uni DONE; \n\t" + "LAB_WAIT: \n\t" +#ifdef CUDA_CP_ASYNC_MBARRIER_WAIT_ACTIVATED + "mbarrier.test_wait.parity.shared.b64 P3, [%0], %1;\n\t" +#else + "ld.volatile.shared.v2.s32 { low, high }, [%0]; \n\t" + "shr.u32 currentPhase, high, 31; \n\t" + "setp.ne.u32 P3, currentPhase, %1; \n\t" +#endif + "@P3 bra.uni DONE; \n\t" + "bra.uni LAB_WAIT; \n\t" + "DONE: \n\t" + "}" + : + : "r"(smem_ptr), "r"(bar_phase)); +#endif + } + + // Set the expected_transaction_count and add 1 arrive count (1 transaction = 1 Byte) + // This PTX maps to SYNCS.ARRIVES.TRANS64.A1TR. + inline __device__ void bar_arrive_set_transactioncnt(int id, int expected_copy_bytes) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 + + uint64_t* bar_ptr = reinterpret_cast(bar_base_ + id); + unsigned smem_ptr = __nvvm_get_smem_pointer(bar_ptr); + asm volatile( + "{\n\t" + "mbarrier.arrive.expect_tx.shared.b64 _, [%0], %1; \n\t" + "}" + : + : "r"(smem_ptr), "r"(expected_copy_bytes)); +#endif + } + + // Set the expected_transaction_count and add 1 arrive count (1 transaction = 1 Byte) + // This PTX maps to SYNCS.ARRIVES.TRANS64.A1TR. 
+ inline __device__ void bar_arrive_set_transactioncnt(int id, int expected_copy_bytes, + uint32_t pred) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 + + uint64_t* bar_ptr = reinterpret_cast(bar_base_ + id); + unsigned smem_ptr = __nvvm_get_smem_pointer(bar_ptr); + asm volatile( + "{\n\t" + ".reg .pred p;\n\t" + "setp.eq.u32 p, %2, 1;\n\t" + "@p mbarrier.arrive.expect_tx.shared.b64 _, [%0], %1; \n\t" + "}" + : + : "r"(smem_ptr), "r"(expected_copy_bytes), "r"(pred)); +#endif + } + + // Sends barrier arrive notification to DSMEM + // Note this uses a slightly different syntax compared to normal arrive + // NOTE : Caller has to ensure that set_bar_base_dsmem has been called prior to using this + // This is done as a compiler optimizations (since set barrier base is independent) + inline __device__ void bar_arrive_dsmem(int const& id) { +#if CUDA_CP_ASYNC_MBARRIER_ARRIVE_ACTIVATED + + uint64_t* bar_ptr = reinterpret_cast(bar_base_ + id); + // TODO : check with PTX team on setctarank (currently emitting errors) + // asm volatile("{\n\t" + //"setctarank.shared.u32 %0, %1, %2;\n\t" + //"}" + // : "=r"(dst_ptr) : "r"(smem_ptr), "r"(cta_id)); + + asm volatile( + "{\n\t" + "mbarrier.arrive.b64 _, [%0];\n\t" + "}" + : + : "l"(bar_ptr)); +#endif + } + + // Just a predicated version of the above function + // Manually inlining it - since the compiler generates BRA instructions at the moment + // NOTE : Caller has to ensure that set_bar_base_dsmem has been called prior to using this + // This is done as a compiler optimizations (since set barrier base is independent) + inline __device__ void bar_arrive_dsmem(int const& id, uint32_t const& pred) { +#if CUDA_CP_ASYNC_MBARRIER_ARRIVE_ACTIVATED + asm volatile( + "{\n\t" + " .reg .pred p;\n\t" + " .reg .s64 addr;\n\t" + " .reg .b64 tmp;\n\t" + " setp.eq.u32 p, %2, 1;\n\t" + " mul.wide.s32 tmp, %0, 8;\n\t" + " add.s64 addr, tmp, %1;\n\t" + "@p mbarrier.arrive.b64 _, [addr];\n\t" + "}" + : + : "r"(id), "l"(bar_base_), "r"(pred)); +#endif + } + + // Sets up the base address for arrival with the correct ctaid in cga + inline __device__ void set_bar_base_dsmem(uint32_t const& cta_id) { + bar_base_ = reinterpret_cast( + ((unsigned long long int)bar_base_ & 0xFFFFFFFFF0FFFFFFULL) + (cta_id << 24)); + } + + inline __device__ void bar_arrive_normal(int id, bool flag = true) { +#if CUDA_CP_ASYNC_ACTIVATED && !(CUDA_CP_ASYNC_MBARRIER_ARRIVE_ACTIVATED) + asm("membar.cta;"); +#endif + + // to make distance for the dependence between atoms.arrive and shfl + if (flag == true) { + uint64_t* bar_ptr = reinterpret_cast(bar_base_ + id); + unsigned smem_ptr = __nvvm_get_smem_pointer(bar_ptr); + +#if CUDA_CP_ASYNC_MBARRIER_ARRIVE_ACTIVATED + + asm volatile( + "{\n\t" + ".reg .b64 state; \n\t" + "mbarrier.arrive.shared.b64 state, [%0];\n\t" + "}" + : + : "r"(smem_ptr)); + +#elif CUDA_CP_ASYNC_ACTIVATED + + asm volatile( + "{\n\t" + ".reg .b64 state; \n\t" + "atom.shared.arrive.b64 state, [%0];" + "}" + : + : "r"(smem_ptr)); +#endif + } + } + + inline __device__ void bar_arrive_ldgsts(int id) { + uint64_t* bar_ptr = reinterpret_cast(bar_base_ + id); + unsigned smem_ptr = __nvvm_get_smem_pointer(bar_ptr); + +#if CUDA_CP_ASYNC_MBARRIER_ARRIVE_ACTIVATED + asm volatile("cp.async.mbarrier.arrive.noinc.shared.b64 [%0];" : : "r"(smem_ptr)); +#elif CUDA_CP_ASYNC_ACTIVATED + asm volatile("cp.async.arrive.shared.b64 [%0];" : : "r"(smem_ptr)); +#endif + } + + inline __device__ uint64_t* bar_base() { return bar_base_; } + + private: + // smem barrier base pointer + uint64_t* 
bar_base_; + // barrier id + int id_; +}; + +// Set the expected_transaction_count and add 1 arrive count (1 transaction = 1 Byte) +// This PTX maps to SYNCS.ARRIVES.TRANS64.A1TR. +inline __device__ void bar_arrive_set_transactioncnt(unsigned smem_ptr, + unsigned expected_copy_bytes) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 + asm volatile( + "{\n\t" + "mbarrier.arrive.expect_copy.shared.b64 _, [%0], %1; \n\t" + "}" + : + : "r"(smem_ptr), "r"(expected_copy_bytes)); +#endif +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace fmha diff --git a/csrc/fmha_v2/fmha/hopper/compute_tile.h b/csrc/fmha_v2/fmha/hopper/compute_tile.h new file mode 100644 index 0000000000..e08c36fc7f --- /dev/null +++ b/csrc/fmha_v2/fmha/hopper/compute_tile.h @@ -0,0 +1,503 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2011-2024 NVIDIA CORPORATION & AFFILIATES. All rights + * reserved. SPDX-License-Identifier: NVIDIA TensorRT Source Code License Agreement + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. + */ + +#pragma once +#include + +namespace fmha { +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Compute_tile_with_gmma {}; + +/* +compute tile used when both operands are coming from SMEM +*/ +template +struct Compute_tile_with_gmma { + static constexpr int NUM_KBLOCKS = Smem_tile_b::BUFFERS_PER_TILE / Cta_tile::WARPS_K; + static_assert(NUM_KBLOCKS * Cta_tile::WARPS_K == Smem_tile_b::BUFFERS_PER_TILE); + // The MMA tile. + using Mma_tile = typename Traits::template Mma_tile; + + // desc for A and B should have the same strategy + static_assert(Smem_tile_a::Gmma_descriptor::GMMA_DESC_SIZE_PER_GROUP == + Smem_tile_b::Gmma_descriptor::GMMA_DESC_SIZE_PER_GROUP, + "GMMA desc for A and B should have the same strategy."); + + // The number of MMAs. + enum { MMAS_M = Mma_tile::MMAS_M }; + + enum { MMAS_N = Mma_tile::MMAS_N }; + + enum { MMAS_K = Mma_tile::MMAS_K }; + + // Ctor. + inline __device__ Compute_tile_with_gmma() {} + + // Ctor, that helps set the gmma descs to support different buffer index as the start address. + inline __device__ Compute_tile_with_gmma(void* a_smem_, void* b_smem_) + : Compute_tile_with_gmma(__nvvm_get_smem_pointer(a_smem_), __nvvm_get_smem_pointer(b_smem_)) { + } + + inline __device__ Compute_tile_with_gmma(uint32_t a_smem_base, uint32_t b_smem_base) + : a_smem_base_(a_smem_base), b_smem_base_(b_smem_base) { + // We always start at buffer 0. + uint32_t a_smem = a_smem_base_; + uint32_t b_smem = b_smem_base_; + +#pragma unroll + for (int mma_m_idx = 0; mma_m_idx < MMAS_M; ++mma_m_idx) { + gmma_desc_a_[mma_m_idx].set_smem_pointer(a_smem + + mma_m_idx * Smem_tile_a::GMMA_GROUP_SMEM_DISTANCE); + // We take the number of buffers directly from the Smem_tile. If we have only one buffer, the + // return offset is 0. 
+ gmma_desc_a_[mma_m_idx].set_max_descriptor_0(Smem_tile_a::BYTES_PER_BUFFER_NO_4LSB * + (Smem_tile_a::BUFFERS_PER_TILE - 1)); + } + +#pragma unroll + for (int mma_n_idx = 0; mma_n_idx < MMAS_N; ++mma_n_idx) { + gmma_desc_b_[mma_n_idx].set_smem_pointer(b_smem + + mma_n_idx * Smem_tile_b::GMMA_GROUP_SMEM_DISTANCE); + gmma_desc_b_[mma_n_idx].set_max_descriptor_0(Smem_tile_b::BYTES_PER_BUFFER_NO_4LSB * + (Smem_tile_b::BUFFERS_PER_TILE - 1)); + } + } + + // move the gmme desc by N buffers. + // Something nice to have if we have persistent kernels. + inline __device__ void increment_N_gmma_desc_group(int N) { +#pragma unroll + for (int idx = 0; idx < Smem_tile_a::Gmma_descriptor::NUM_DESCRIPTORS; ++idx) { +#pragma unroll + for (int mma_m_idx = 0; mma_m_idx < MMAS_M; ++mma_m_idx) { + uint64_t temp_desc = gmma_desc_a_[mma_m_idx].get_descriptor(idx); + int2& tmp = reinterpret_cast(temp_desc); + tmp.x = (tmp.x & 0xFFFF0000) + (a_smem_base_ / 16) + + mma_m_idx * Smem_tile_a::GMMA_GROUP_SMEM_DISTANCE / 16 + + N * Smem_tile_a::BYTES_PER_BUFFER_NO_4LSB; + gmma_desc_a_[mma_m_idx].set_descriptor(idx, temp_desc); + } + +#pragma unroll + for (int mma_n_idx = 0; mma_n_idx < MMAS_N; ++mma_n_idx) { + uint64_t temp_desc = gmma_desc_b_[mma_n_idx].get_descriptor(idx); + int2& tmp = reinterpret_cast(temp_desc); + tmp.x = + (tmp.x & 0xFFFF0000) + (b_smem_base_ / 16) + N * Smem_tile_b::BYTES_PER_BUFFER_NO_4LSB; + gmma_desc_b_[mma_n_idx].set_descriptor(idx, temp_desc); + } + } + } + + // Clear the accumulators. It does nothing as we have a special flag for GMMA. + inline __device__ void clear() { fmha::clear(acc_); } + + // smarter way of increment a group of gmma desc. + // if one of them need to be reset to the first ldgsts buffer + // it is very likely (currently guaranteed) that all of them need to be reset to the first + // ldgsts buffer. + // we do this to save the usage of uniform register. Otherwise, kernel with larger M could not + // achieve sol. + inline __device__ void increment_gmma_desc_group() { + bool reset_buffer_a = + gmma_desc_a_[0].get_descriptor(0) >= gmma_desc_a_[0].get_max_descriptor_0(); + bool reset_buffer_b = + gmma_desc_b_[0].get_descriptor(0) >= gmma_desc_b_[0].get_max_descriptor_0(); + +#pragma unroll + for (int idx = 0; idx < Smem_tile_a::Gmma_descriptor::NUM_DESCRIPTORS; ++idx) { +#pragma unroll + for (int mma_m_idx = 0; mma_m_idx < MMAS_M; ++mma_m_idx) { + uint64_t temp_desc = gmma_desc_a_[mma_m_idx].get_descriptor(idx); + // smem start address is in lower 32bits + int2& tmp = reinterpret_cast(temp_desc); + if (reset_buffer_a) { + tmp.x -= (Smem_tile_a::BUFFERS_PER_TILE - 1) * Smem_tile_a::BYTES_PER_BUFFER_NO_4LSB; + } else { + tmp.x += Smem_tile_a::BYTES_PER_BUFFER_NO_4LSB; + } + + gmma_desc_a_[mma_m_idx].set_descriptor(idx, temp_desc); + } + +#pragma unroll + for (int mma_n_idx = 0; mma_n_idx < MMAS_N; ++mma_n_idx) { + uint64_t temp_desc = gmma_desc_b_[mma_n_idx].get_descriptor(idx); + int2& tmp = reinterpret_cast(temp_desc); + if (reset_buffer_b) { + tmp.x -= (Smem_tile_b::BUFFERS_PER_TILE - 1) * Smem_tile_b::BYTES_PER_BUFFER_NO_4LSB; + } else { + tmp.x += Smem_tile_b::BYTES_PER_BUFFER_NO_4LSB; + } + gmma_desc_b_[mma_n_idx].set_descriptor(idx, temp_desc); + } + } + } + + // smarter way of increment a group of gmma desc. + // if one of them need to be reset to the first ldgsts buffer + // it is very likely (currently guaranteed) that all of them need to be reset to the first + // ldgsts buffer. + // we do this to save the usage of uniform register. 
Otherwise, kernel with larger M could not + // achieve sol. + inline __device__ void increment_gmma_desc_a_group() { + bool reset_buffer = gmma_desc_a_[0].get_descriptor(0) >= gmma_desc_a_[0].get_max_descriptor_0(); + +#pragma unroll + for (int idx = 0; idx < Smem_tile_b::Gmma_descriptor::NUM_DESCRIPTORS; ++idx) { +#pragma unroll + for (int mma_m_idx = 0; mma_m_idx < MMAS_M; ++mma_m_idx) { + uint64_t temp_desc = gmma_desc_a_[mma_m_idx].get_descriptor(idx); + // smem start address is in lower 32bits + int2& tmp = reinterpret_cast(temp_desc); + if (reset_buffer) { + tmp.x -= (Smem_tile_a::BUFFERS_PER_TILE - 1) * Smem_tile_a::BYTES_PER_BUFFER_NO_4LSB; + } else { + tmp.x += Smem_tile_a::BYTES_PER_BUFFER_NO_4LSB; + } + gmma_desc_a_[mma_m_idx].set_descriptor(idx, temp_desc); + } + } + } + + // smarter way of increment a group of gmma desc. + // if one of them need to be reset to the first ldgsts buffer + // it is very likely (currently guaranteed) that all of them need to be reset to the first + // ldgsts buffer. + // we do this to save the usage of uniform register. Otherwise, kernel with larger M could not + // achieve sol. + template + inline __device__ void increment_gmma_desc_b_group(int N = 1) { + bool reset_buffer = + RESET_CHECK && gmma_desc_b_[0].get_descriptor(0) >= gmma_desc_b_[0].get_max_descriptor_0(); + +#pragma unroll + for (int idx = 0; idx < Smem_tile_b::Gmma_descriptor::NUM_DESCRIPTORS; ++idx) { +#pragma unroll + for (int mma_n_idx = 0; mma_n_idx < MMAS_N; ++mma_n_idx) { + uint64_t temp_desc = gmma_desc_b_[mma_n_idx].get_descriptor(idx); + int2& tmp = reinterpret_cast(temp_desc); + if (reset_buffer) { + tmp.x -= (Smem_tile_b::BUFFERS_PER_TILE - 1) * Smem_tile_b::BYTES_PER_BUFFER_NO_4LSB; + } else { + tmp.x += Smem_tile_b::BYTES_PER_BUFFER_NO_4LSB; + } + gmma_desc_b_[mma_n_idx].set_descriptor(idx, temp_desc); + } + } + } + + // Compute. + // last of group indicates it is the last GMMA with a GMMA group. So the GSB should be updated + // last of kblock indicates it is the last GMMA with kblock. so desc will be updated accordingly + inline __device__ void compute(int ki, bool last_of_group = false, bool last_of_kblock = false) { +#pragma unroll + for (int mmas_m_idx = 0; mmas_m_idx < MMAS_M; ++mmas_m_idx) { +#pragma unroll + for (int mmas_n_idx = 0; mmas_n_idx < MMAS_N; ++mmas_n_idx) { + // weird code to use SEL to avoid reg spill + typename Smem_tile_a::Gmma_descriptor::Single_desc single_desc_a; + typename Smem_tile_b::Gmma_descriptor::Single_desc single_desc_b; + + single_desc_a.set(gmma_desc_a_[mmas_m_idx].get_descriptor(ki)); + single_desc_b.set(gmma_desc_b_[mmas_n_idx].get_descriptor(ki)); + + if (mmas_n_idx == (MMAS_N - 1)) { + // update desc for A + gmma_desc_a_[mmas_m_idx].increment_single_descriptor(last_of_kblock); + } + if (mmas_m_idx == (MMAS_M - 1)) { + // update desc for B + gmma_desc_b_[mmas_n_idx].increment_single_descriptor(last_of_kblock); + } + + if ((last_of_group == true) && (mmas_m_idx == (MMAS_M - 1)) && + (mmas_n_idx == (MMAS_N - 1))) { + // increment the scoreboard + acc_[mmas_m_idx][mmas_n_idx].template mma(single_desc_a, single_desc_b); + } else { + acc_[mmas_m_idx][mmas_n_idx].template mma(single_desc_a, single_desc_b); + } + } // for (mmas_n_idx) + } // for (mmas_m_idx) + } + + // Load from shared memory. For GMMA where both operand comes from SMEM, this does nothing + inline __device__ void load(Smem_tile_a& smem_a, Smem_tile_b& smem_b, int ki, + bool first = false) {} + + // The accumulators. 
+ Fragment_accumulator acc_[MMAS_M][MMAS_N]; + + // one descriptor group per stage, different GMMAs may or maynot share descriptor group + // each descriptor group holds all the descriptors for the entire kblock + + // The descriptor to load A. + typename Smem_tile_a::Gmma_descriptor gmma_desc_a_[MMAS_M]; + // The descriptor to load B. + typename Smem_tile_b::Gmma_descriptor gmma_desc_b_[MMAS_N]; + uint32_t a_smem_base_, b_smem_base_; +}; + +/* +compute tile used when A is from RF, B is from SMEM +*/ +template +struct Compute_tile_with_gmma { + // The MMA tile. + using Mma_tile = typename Traits::template Mma_tile; + + // The fragment for holding A. + using Fragment = Fragment_a; + + // static_assert(Cta_tile::K == 128); + // static_assert(Mma_tile::K_PER_MMA_PER_CTA == 64 ); + // pstatic_assert(NUM_KBLOCKS == 384 / 64); + static constexpr int NUM_KBLOCKS = Smem_tile_b::BUFFERS_PER_TILE / Cta_tile::WARPS_K; + // static_assert(NUM_KBLOCKS * Cta_tile::WARPS_K == Smem_tile_b::BUFFERS_PER_TILE); + + // desc for A and B should have the same strategy + static_assert(Smem_tile_a::Gmma_descriptor::GMMA_DESC_SIZE_PER_GROUP == + Smem_tile_b::Gmma_descriptor::GMMA_DESC_SIZE_PER_GROUP, + "GMMA desc for A and B should have the same strategy."); + + // The number of MMAs. + enum { MMAS_M = Mma_tile::MMAS_M }; + + enum { MMAS_N = Mma_tile::MMAS_N }; + + // TODO + enum { MMAS_K = Mma_tile::MMAS_K * Cta_tile::WARPS_K }; + + // Ctor. + inline __device__ Compute_tile_with_gmma() {} + + // Ctor, that helps set the gmma descs + inline __device__ Compute_tile_with_gmma(void* a_smem_, void* b_smem_) + : Compute_tile_with_gmma(__nvvm_get_smem_pointer(a_smem_), __nvvm_get_smem_pointer(b_smem_)) { + } + + inline __device__ Compute_tile_with_gmma(uint32_t, uint32_t b_smem_base) + : b_smem_base_(b_smem_base) { + // We always start at buffer 0 and take the number of buffers from the Smem_tile, as above. + uint32_t b_smem = b_smem_base_; +// do not need to set desc for matrix A +#pragma unroll + for (int mma_n_idx = 0; mma_n_idx < MMAS_N; ++mma_n_idx) { + gmma_desc_b_[mma_n_idx].set_smem_pointer(b_smem + + mma_n_idx * Smem_tile_b::GMMA_GROUP_SMEM_DISTANCE); + gmma_desc_b_[mma_n_idx].set_max_descriptor_0(Smem_tile_b::BYTES_PER_BUFFER_NO_4LSB * + (Smem_tile_b::BUFFERS_PER_TILE - 1)); + } + } + + // move the gmme desc by N buffers. + // Something nice to have if we have persistent kernels. + inline __device__ void increment_N_gmma_desc_group(int N) { +#pragma unroll + for (int idx = 0; idx < Smem_tile_b::Gmma_descriptor::NUM_DESCRIPTORS; ++idx) { +#pragma unroll + for (int mma_n_idx = 0; mma_n_idx < MMAS_N; ++mma_n_idx) { + uint64_t temp_desc = gmma_desc_b_[mma_n_idx].get_descriptor(idx); + int2& tmp = reinterpret_cast(temp_desc); + tmp.x = + (tmp.x & 0xFFFF0000) + (b_smem_base_ / 16) + (N)*Smem_tile_b::BYTES_PER_BUFFER_NO_4LSB; + gmma_desc_b_[mma_n_idx].set_descriptor(idx, temp_desc); + } + } + } + + // Clear the accumulators. It does nothing as we have a special flag for GMMA. + inline __device__ void clear() { fmha::clear(acc_); } + + // smarter way of increment a group of gmma desc. + // if one of them need to be reset to the first ldgsts buffer + // it is very likely (currently guaranteed) that all of them need to be reset to the first + // ldgsts buffer. + // we do this to save the usage of uniform register. Otherwise, kernel with larger M could not + // achieve sol. 
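+  // Illustrative walk-through (buffer count assumed for the example): with
+  // BUFFERS_PER_TILE == 3, two consecutive calls advance the B descriptors by one buffer each
+  // (buffer 0 -> 1 -> 2); on the third call the descriptor has reached max_descriptor_0, so
+  // 2 * BYTES_PER_BUFFER_NO_4LSB is subtracted and the group points at buffer 0 again,
+  // without spending a uniform register on an explicit buffer index.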
+ + template + inline __device__ void increment_gmma_desc_group(int N = 1) { + bool reset_buffer = + RESET_CHECK && gmma_desc_b_[0].get_descriptor(0) >= gmma_desc_b_[0].get_max_descriptor_0(); + +#pragma unroll + for (int idx = 0; idx < Smem_tile_b::Gmma_descriptor::NUM_DESCRIPTORS; ++idx) { +#pragma unroll + for (int mma_n_idx = 0; mma_n_idx < MMAS_N; ++mma_n_idx) { + uint64_t temp_desc = gmma_desc_b_[mma_n_idx].get_descriptor(idx); + int2& tmp = reinterpret_cast(temp_desc); + if (reset_buffer) { + tmp.x -= (Smem_tile_b::BUFFERS_PER_TILE - 1) * Smem_tile_b::BYTES_PER_BUFFER_NO_4LSB; + } else { + tmp.x += Smem_tile_b::BYTES_PER_BUFFER_NO_4LSB; + } + gmma_desc_b_[mma_n_idx].set_descriptor(idx, temp_desc); + } + } + } + + // Compute. + // last of group indicates it is the last GMMA with a GMMA group. So the GSB should be updated + // last of kblock indicates it is the last GMMA with kblock. so desc will be updated accordingly + inline __device__ void compute(int ki, bool last_of_group = false, bool last_of_kblock = false) { +#pragma unroll + for (int mmas_m_idx = 0; mmas_m_idx < MMAS_M; ++mmas_m_idx) { +#pragma unroll + for (int mmas_n_idx = 0; mmas_n_idx < MMAS_N; ++mmas_n_idx) { + // weird code to use SEL to avoid reg spill + typename Smem_tile_b::Gmma_descriptor::Single_desc single_desc_b; + + single_desc_b.set(gmma_desc_b_[mmas_n_idx].get_descriptor(ki)); + + if (mmas_m_idx == (MMAS_M - 1)) { + // update desc for B + gmma_desc_b_[mmas_n_idx].increment_single_descriptor(last_of_kblock); + } + + if ((last_of_group == true) && (mmas_m_idx == (MMAS_M - 1)) && + (mmas_n_idx == (MMAS_N - 1))) { + // increment the scoreboard + acc_[mmas_m_idx][mmas_n_idx].template mma(a_[mmas_m_idx], single_desc_b); + } else { + acc_[mmas_m_idx][mmas_n_idx].template mma(a_[mmas_m_idx], single_desc_b); + } + } // for (mmas_n_idx) + } // for (mmas_m_idx) + } + + template + inline __device__ void compute_incta_splitk(Fragment const (&frag_a)[K][1], int const warp_k) { + if (Smem_tile_b::Gmma_descriptor::TRANS_MODE == Gmma_descriptor_transpose::NOTRANS) { + // In this case, the K dimension is the leading dimension, so we need to set the smem + // locations correctly for each Warp in K. + + // The number of elements in K per group. + constexpr int ELTS_PER_KGROUP = Smem_tile_b::BYTES_PER_ROW / sizeof(typename Traits::B_type); + // The number of MMAS to perform before incrementing by the group stride. + constexpr int MMAS_K_PER_GROUP = ELTS_PER_KGROUP / Traits::GMMA_K; + // The number of MMAS a k-warp performs. + constexpr int MMAS_K_PER_WARP = Mma_tile::MMAS_K; + + int const group_offset = warp_k * MMAS_K_PER_WARP; + // Initialize the descriptor + int gi = group_offset / MMAS_K_PER_GROUP; + int ii = group_offset % MMAS_K_PER_GROUP; + + int BYTES_OFFSET_NO_4LSB = gi * Smem_tile_b::BYTES_PER_BUFFER_NO_4LSB + + ii * Smem_tile_b::Gmma_descriptor::BYTES_PER_DESC_NO_4LSB; + + uint64_t desc_b = gmma_desc_b_[0].get_descriptor(0); + int2& desc_b_view = reinterpret_cast(desc_b); + desc_b_view.x += BYTES_OFFSET_NO_4LSB; + + typename Smem_tile_b::Gmma_descriptor::Single_desc single_desc_b; + single_desc_b.set(desc_b); +#pragma unroll + for (int ki = 0; ki < MMAS_K_PER_WARP - 1; ki++) { + acc_[0][0].template mma(frag_a[ki][0], single_desc_b); + + // Increment the descriptor for the next kblock. + int const ki_next = group_offset + ki + 1; + // Update descriptor for next GMMA. 
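+        // Illustration (group size assumed): with MMAS_K_PER_GROUP == 4, a ki_next that is not
+        // a multiple of 4 advances by one descriptor inside the current k-group, while a
+        // multiple of 4 crosses a k-group boundary and the address jumps to the start of the
+        // next buffer instead.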
+ if (ki_next % MMAS_K_PER_GROUP == 0) { + desc_b_view.x += Smem_tile_b::BYTES_PER_BUFFER_NO_4LSB - + Smem_tile_b::Gmma_descriptor::BYTES_DESC_INC_BOUNDARY_NO_4LSB; + } else { + desc_b_view.x += Smem_tile_b::Gmma_descriptor::BYTES_PER_DESC_NO_4LSB; + } + single_desc_b.set(desc_b); + } + // Last one increments gsb. + acc_[0][0].template mma(frag_a[MMAS_K_PER_WARP - 1][0], single_desc_b); + } else { // GMMA supports transposed input: we can just advance SMEM address to the k-th block + // for each Warp in K. + + constexpr int NUM_KGROUPS = Smem_tile_b::BUFFERS_PER_TILE; + constexpr int MMAS_K_PER_GROUP = Mma_tile::MMAS_K / NUM_KGROUPS; + static_assert(MMAS_K_PER_GROUP * NUM_KGROUPS == Mma_tile::MMAS_K); + + uint64_t temp_desc = gmma_desc_b_[0].get_descriptor(0); + int2& tmp = reinterpret_cast(temp_desc); + + constexpr int BYTES_PER_K_GROUP_NO_4LSB = + Mma_tile::K_PER_WARP_GROUP * Mma_tile::N_PER_WARP_GROUP * sizeof(Traits::B_type) / 16; + tmp.x += warp_k * BYTES_PER_K_GROUP_NO_4LSB; + gmma_desc_b_[0].set_descriptor(0, temp_desc); + +#pragma unroll + for (int kbi = 0; kbi < NUM_KGROUPS - 1; kbi++) { +#pragma unroll + for (int ki = 0; ki < MMAS_K_PER_GROUP; ki++) { + fill_frag_a(frag_a[kbi * MMAS_K_PER_GROUP + ki][0]); + // Never increment scoreboard, but check for last kblock. + compute(ki, false, ki == MMAS_K_PER_GROUP - 1); + } + increment_gmma_desc_group(); + } + +#pragma unroll + for (int ki = 0; ki < MMAS_K_PER_GROUP - 1; ki++) { + fill_frag_a(frag_a[(NUM_KGROUPS - 1) * MMAS_K_PER_GROUP + ki][0]); + compute(ki); + } + + fill_frag_a(frag_a[NUM_KGROUPS * MMAS_K_PER_GROUP - 1][0]); + compute(NUM_KGROUPS * MMAS_K_PER_GROUP - 1, true, true); + } + } + + // Fill the input fragment + inline __device__ void fill_frag_a(Fragment a_temp) { +#pragma unroll + for (int idx = 0; idx < Fragment::NUM_REGS; ++idx) { + a_[0].reg(idx) = a_temp.reg(idx); + } + } + + // Load from shared memory. + // we don't actually need this with MHA fused kernel. + inline __device__ void load(Smem_tile_a& smem_a, Smem_tile_b& smem_b, int ki) { + // smem_a.load( a_[ki], ki ); + } + + // The accumulators. + Fragment_accumulator acc_[MMAS_M][MMAS_N]; + + // The fragments to load A. + // Need to think about is is better to declare as Fragment a_? + // for the second GEMM, MMAS_M is most likely 1. (at least for now. ) + Fragment a_[MMAS_M]; + + // one descriptor group per stage, different GMMAs may or maynot share descriptor group + // each descriptor group holds all the descriptors for the entire kblock + + // The descriptor to load B. + typename Smem_tile_b::Gmma_descriptor gmma_desc_b_[MMAS_N]; + uint32_t b_smem_base_; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace fmha diff --git a/csrc/fmha_v2/fmha/hopper/fragment.h b/csrc/fmha_v2/fmha/hopper/fragment.h new file mode 100644 index 0000000000..0ee3c7e5be --- /dev/null +++ b/csrc/fmha_v2/fmha/hopper/fragment.h @@ -0,0 +1,491 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2011-2024 NVIDIA CORPORATION & AFFILIATES. All rights + * reserved. SPDX-License-Identifier: NVIDIA TensorRT Source Code License Agreement + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. 
Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. + */ + +#pragma once + +#include +#include +#include +#include + +namespace fmha { + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// F R A G M E N T (A) +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// Only needed if Operand A is coming from RF. +template +struct Fragment_a, Layout> + : public Fragment { + // A should be coming from RF. + static_assert(A_RF, "A_RF must be true to allocate RF for Operand A.\n"); +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// Only needed if Operand A is coming from RF. +template +struct Fragment_a, Layout> + : public Fragment { + // A should be coming from RF. + static_assert(A_RF, "A_RF must be true to allocate RF for Operand A.\n"); +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// Only needed if Operand A is coming from RF. +template +struct Fragment_a, Layout> + : public Fragment { + // A should be coming from RF. + static_assert(GMMA_A_RF == true, "GMMA_A_RF must be true to allocate RF for Operand A.\n"); +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// Only needed if Operand A is coming from RF. +template +struct Fragment_a, + Layout> + : public Fragment { + // A should be coming from RF. + static_assert(GMMA_A_RF == true, "GMMA_A_RF must be true to allocate RF for Operand A.\n"); +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Fragment_a, + Layout> + // TODO: Do we need the * 4 or not? + : public Fragment { + static_assert(sizeof(Input_type_A) == 1); + static_assert(sizeof(Input_type_B) == 1); +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// H G M M A . F 1 6 +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// both operands are coming from SMEM + +template +struct Fragment_accumulator> + : public Fragment { + // The base class. + using Base = Fragment; + + // Add two fragments. + template + inline __device__ void add(Other_fragment_ const& other) { + for (int ii = 0; ii < Base::NUM_REGS; ++ii) { + this->reg(ii) = hadd2(this->reg(ii), other.reg(ii)); + } + } + + // Do the GMMA. + template + inline __device__ void mma(Gmma_single_desc_a const& single_desc_a, + Gmma_single_desc_b const& single_desc_b) { + // call hgmma + fmha::hgmma_fp16< + Gmma_single_desc_a::TRANS_MODE == fmha::Gmma_descriptor_transpose::TRANS ? true : false, + Gmma_single_desc_b::TRANS_MODE == fmha::Gmma_descriptor_transpose::TRANS ? true : false, + GMMA_N, INCREMENT_SCORE_BOARD>(single_desc_a.get(), single_desc_b.get(), this->regs_); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// both operands are coming from SMEM + +template +struct Fragment_accumulator> + : public Fragment { + // The base class. + using Base = Fragment; + + // Add two fragments. 
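+  // bf16 GMMA accumulates in fp32, so unlike the fp16 specialization above (which adds packed
+  // half2 registers with hadd2) the sum here is formed element-wise on float values.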
+ template + inline __device__ void add(Other_fragment_ const& other) { + for (int ii = 0; ii < Base::NUM_REGS; ++ii) { + this->elt(ii) = this->elt(ii) + other.elt(ii); + } + } + + // Do the GMMA. + template + inline __device__ void mma(Gmma_single_desc_a const& single_desc_a, + Gmma_single_desc_b const& single_desc_b) { + // call hgmma + fmha::hgmma_bf16< + Gmma_single_desc_a::TRANS_MODE == fmha::Gmma_descriptor_transpose::TRANS ? true : false, + Gmma_single_desc_b::TRANS_MODE == fmha::Gmma_descriptor_transpose::TRANS ? true : false, + GMMA_N, INCREMENT_SCORE_BOARD>(single_desc_a.get(), single_desc_b.get(), this->regs_); + } +}; + +////////////////////////////////////////////////////////////////////////////////////////////////// +// A is coming from RF; B is coming from SMEM + +template +struct Fragment_accumulator> + : public Fragment { + // The base class. + using Base = Fragment; + + // The Traits + using Traits = Hopper_hgmma_fp16_traits; + + // Add two fragments. + template + inline __device__ void add(Other_fragment_ const& other) { + for (int ii = 0; ii < Base::NUM_REGS; ++ii) { + this->reg(ii) = hadd2(this->reg(ii), other.reg(ii)); + } + } + + // Do the GMMA. + template + inline __device__ void mma(Fragment_a const& a, + Gmma_single_desc_b const& single_desc_b) { + // call hgmma + fmha::hgmma_rfa_fp16< + Gmma_single_desc_b::TRANS_MODE == fmha::Gmma_descriptor_transpose::TRANS ? true : false, + GMMA_N, INCREMENT_SCORE_BOARD>(a.regs_, single_desc_b.get(), this->regs_); + } +}; + +////////////////////////////////////////////////////////////////////////////////////////////////// +// A is coming from RF; B is coming from SMEM + +template +struct Fragment_accumulator> + : public Fragment { + // The base class. + using Base = Fragment; + + // The Traits + using Traits = Hopper_hgmma_bf16_traits; + + // Add two fragments. + template + inline __device__ void add(Other_fragment_ const& other) { + for (int ii = 0; ii < Base::NUM_ELTS; ++ii) { + this->elt(ii) = this->elt(ii) + other.elt(ii); + } + } + + // Do the GMMA. + template + inline __device__ void mma(Fragment_a const& a, + Gmma_single_desc_b const& single_desc_b) { + // call hgmma + fmha::hgmma_rfa_bf16< + Gmma_single_desc_b::TRANS_MODE == fmha::Gmma_descriptor_transpose::TRANS ? true : false, + GMMA_N, INCREMENT_SCORE_BOARD>(a.regs_, single_desc_b.get(), this->regs_); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// H G M M A . F 3 2 +// +////////////////////////////////////////////////////////////////////////////////////////////////// +// both operands are coming from SMEM +template +struct Fragment_accumulator> + : public Fragment { + // The base class. + using Base = Fragment; + + // Add two fragments. + template + inline __device__ void add(Other_fragment_ const& other) { + for (int ii = 0; ii < Base::NUM_ELTS; ++ii) { + this->elt(ii) = this->elt(ii) + other.elt(ii); + } + } + + // Do the GMMA. + template + inline __device__ void mma(Gmma_single_desc_a const& single_desc_a, + Gmma_single_desc_b const& single_desc_b) { + // call hgmma + fmha::hgmma_fp32< + Gmma_single_desc_a::TRANS_MODE == fmha::Gmma_descriptor_transpose::TRANS ? true : false, + Gmma_single_desc_b::TRANS_MODE == fmha::Gmma_descriptor_transpose::TRANS ? 
true : false, + GMMA_N, INCREMENT_SCORE_BOARD>(single_desc_a.get(), single_desc_b.get(), this->regs_); + } +}; + +// +//////////////////////////////////////////////////////////////////////////////////////////////////// +// A is coming from RF; B is coming from SMEM +template +struct Fragment_accumulator> + : public Fragment { + // The base class. + using Base = Fragment; + + // The Traits + using Traits = Hopper_hgmma_fp32_traits; + + // Add two fragments. + template + inline __device__ void add(Other_fragment_ const& other) { + for (int ii = 0; ii < Base::NUM_ELTS; ++ii) { + this->elt(ii) = this->elt(ii) + other.elt(ii); + } + } + + // Do the GMMA. + template + inline __device__ void mma(Fragment_a const& a, + Gmma_single_desc_b const& single_desc_b) { + // call hgmma + fmha::hgmma_rfa_fp32< + Gmma_single_desc_b::TRANS_MODE == fmha::Gmma_descriptor_transpose::TRANS ? true : false, + GMMA_N, INCREMENT_SCORE_BOARD>(a.regs_, single_desc_b.get(), this->regs_); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Q G M M A . F 3 2 +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// I G M M A . I N T 8 +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// Both operands are coming from SMEM. +template +struct Fragment_accumulator> + : public Fragment { + // The base class. + using Base = Fragment; + + // Do the GMMA. + template + inline __device__ void mma(Gmma_single_desc_a const& single_desc_a, + Gmma_single_desc_b const& single_desc_b) { + fmha::igmma_int8_int32(single_desc_a.get(), single_desc_b.get(), + this->regs_); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// A is coming from RF; B is coming from SMEM + +template +struct Fragment_accumulator> + : public Fragment { + // The base class. + using Base = Fragment; + + // The Traits. + using Traits = Hopper_igmma_int8_int32_traits; + + // Do the GMMA. + template + inline __device__ void mma(Fragment_a const& a, + Gmma_single_desc_b const& single_desc_b) { + fmha::igmma_rfa_int8_int32(a.regs_, single_desc_b.get(), + this->regs_); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// Fp32 Accumulator A operand from RF and B operand from SMEM +template +struct Fragment_accumulator> + : public Fragment { + // The base class. + using Base = Fragment; + + // Add two fragments. + template + inline __device__ void add(Other_fragment_ const& other) { + for (int ii = 0; ii < Base::NUM_ELTS; ++ii) { + this->elt(ii) = this->elt(ii) + other.elt(ii); + } + } + + // The Traits + using Traits = Hopper_qgmma_fp8_fp32_traits; + + // Do the GMMA. 
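+  // The fp8 element types of A and B select the matching QGMMA flavour below
+  // (e4m3/e5m2 in either operand); any other combination asserts.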
+ template + inline __device__ void mma(Fragment_a const& a, + Gmma_single_desc_b const& single_desc_b) { + // call hgmma + if (std::is_same_v && std::is_same_v) { + qgmma_rfa_e4m3_e4m3_fp32(a.regs_, single_desc_b.get(), + this->regs_); + } else if (std::is_same_v && std::is_same_v) { + qgmma_rfa_e5m2_e4m3_fp32(a.regs_, single_desc_b.get(), + this->regs_); + } else if (std::is_same_v && std::is_same_v) { + qgmma_rfa_e4m3_e5m2_fp32(a.regs_, single_desc_b.get(), + this->regs_); + } else if (std::is_same_v && std::is_same_v) { + qgmma_rfa_e5m2_e5m2_fp32(a.regs_, single_desc_b.get(), + this->regs_); + } else { + assert(false && "unsupported"); + } + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// fp32 accumulator +// Both operands are coming from SMEM. +template +struct Fragment_accumulator> + : public Fragment { + // The base class. + using Base = Fragment; + + // Do the GMMA. + template + inline __device__ void mma(Gmma_single_desc_a const& single_desc_a, + Gmma_single_desc_b const& single_desc_b) { + if (std::is_same_v && std::is_same_v) { + qgmma_e4m3_e4m3_fp32(single_desc_a.get(), single_desc_b.get(), + this->regs_); + } else if (std::is_same_v && std::is_same_v) { + qgmma_e5m2_e4m3_fp32(single_desc_a.get(), single_desc_b.get(), + this->regs_); + } else if (std::is_same_v && std::is_same_v) { + qgmma_e4m3_e5m2_fp32(single_desc_a.get(), single_desc_b.get(), + this->regs_); + } else if (std::is_same_v && std::is_same_v) { + qgmma_e5m2_e5m2_fp32(single_desc_a.get(), single_desc_b.get(), + this->regs_); + } else { + assert(false && "unsupported"); + } + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Softmax_saver_tma { + // Warps. + enum { WARPS_M = Cta_tile::WARPS_M }; + + enum { WARPS_N = Cta_tile::WARPS_N }; + + enum { WARPS_K = Cta_tile::WARPS_K }; + + // Ctor. + template + inline __device__ Softmax_saver_tma(Params const& params, Head_info const& head_info) + : actual_len_(head_info.actual_seqlen), + local_q_tile_offset_(head_info.local_q_tile_offset), + softmax_sum_ptr_(reinterpret_cast(params.softmax_stats_ptr)), + softmax_stats_stride_in_bytes_(params.softmax_stats_stride_in_bytes) { + softmax_max_ptr_ = reinterpret_cast(params.softmax_stats_ptr); + int warp = (threadIdx.x % 128) / Cta_tile::THREADS_PER_WARP; + int lane = threadIdx.x % Cta_tile::THREADS_PER_WARP; + // MMA row0 index (8x4 thread layout) + row0_ = warp * Mma_tile::M_PER_MMA / WARPS_M + (lane / 4); + + int sum_s = + params.is_s_padded ? params.s * head_info.bidb : params.cu_q_seqlens[head_info.bidb]; + int token_id = sum_s * params.h + head_info.bidh; + size_t const bh_offset = + token_id * sizeof(float) * 2 + local_q_tile_offset_ * softmax_stats_stride_in_bytes_; + softmax_max_ptr_ += bh_offset + row0_ * softmax_stats_stride_in_bytes_; + softmax_sum_ptr_ += bh_offset + row0_ * softmax_stats_stride_in_bytes_ + sizeof(float); + }; + + inline __device__ void store(float* p_sum, float* p_max, float sqrt_d, int row_offset, + bool valid_run) { + // Four threads process two rows in mma, each row has one softmax_sum and one softmax_max. + // Here we use one thread to write one softmax element. + float values; + int lane = threadIdx.x % Cta_tile::THREADS_PER_WARP; + if (lane % 4 < 2) { + values = p_sum[lane % 2]; + } else { + values = p_max[lane % 2] / sqrt_d; + } + if (!valid_run && (lane % 4) < 2) { + values = 1.0; + } + char* dst_ptr = (lane % 4 < 2) ? 
softmax_sum_ptr_ : softmax_max_ptr_; + size_t off_inside_mma = (lane % 2 == 0) ? row_offset : row_offset + 8; + if (local_q_tile_offset_ + row0_ + off_inside_mma < actual_len_) { + fmha::stg(dst_ptr + off_inside_mma * softmax_stats_stride_in_bytes_, values); + } + } + + // ptr + char* softmax_sum_ptr_ = nullptr; + char* softmax_max_ptr_ = nullptr; + + // the first row's idx + int row0_; + // actual seq length + int const actual_len_; + int const softmax_stats_stride_in_bytes_; + int const local_q_tile_offset_; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace fmha diff --git a/csrc/fmha_v2/fmha/hopper/gmem_tile_o_packed.h b/csrc/fmha_v2/fmha/hopper/gmem_tile_o_packed.h new file mode 100644 index 0000000000..7c9ac43bb8 --- /dev/null +++ b/csrc/fmha_v2/fmha/hopper/gmem_tile_o_packed.h @@ -0,0 +1,1138 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2011-2024 NVIDIA CORPORATION & AFFILIATES. All rights + * reserved. SPDX-License-Identifier: NVIDIA TensorRT Source Code License Agreement + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. + */ + +#pragma once +#include +#include + +namespace fmha { + +namespace v2 { + +template +struct Gmem_tile_o_hopper {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// Not super proud of this. Need to refactor. +// A not optimized way of storing tile_O, without SMEM swizzle. +// STG.32 is going to be used. +template +struct Gmem_tile_o_hopper_16bits { + // The associated MMA tile. + using Mma_tile = typename Traits::template Mma_tile; + + // The number of elements per STG. + enum { ELEMENTS_PER_STG = 2 }; + + // The size in bytes of each element. + enum { BYTES_PER_ELEMENT = 2 }; + + // The size of each STG. + enum { BYTES_PER_STG = ELEMENTS_PER_STG * BYTES_PER_ELEMENT }; + + // The size of a row in bytes. + enum { BYTES_PER_ROW = Cta_tile::VALID_N * BYTES_PER_ELEMENT }; + + // The number of rows accessed by each thread. + enum { ROWS_PER_THREAD = Mma_tile::M_PER_MMA / 8 / Cta_tile::WARPS_PER_CTA }; + + enum { ROWS = Cta_tile::M }; + + // The number of columns access by each thread. + // Note there are 2 elements per reg. + enum { COLS_PER_THREAD = Mma_tile::N_PER_MMA / 4 / 2 }; + + // The number of valid columns (stored to GMEM) by each thread. + enum { + VALID_COLS_PER_THREAD_FOR_LAST_MMA = (Cta_tile::VALID_N % Mma_tile::N_PER_MMA) == 0 + ? COLS_PER_THREAD + : (Cta_tile::VALID_N % Mma_tile::N_PER_MMA) / 8 + }; + + enum { VALID_MMAS_N = fmha::Div_up::VALUE }; + + static_assert(Cta_tile::VALID_N % 8 == 0, "The valid head dimension needs to be multiple of 8."); + + // The number of accumulator held by each thread, per HGMMA instruction. + enum { ELTS_PER_THREAD = ROWS_PER_THREAD * COLS_PER_THREAD }; + + // Currently, we assume for o matrix, GMMA M/N shape matches CTA M/N shape. + static_assert(Mma_tile::M_PER_MMA == Cta_tile::M && + Mma_tile::N_PER_MMA * Mma_tile::MMAS_N == Cta_tile::N, + "Currently, we assume for o matrix, GMMA M shape matches CTA M shape. "); + + // Step N for one quad + enum { STEP_N = 8 * BYTES_PER_ELEMENT }; + + // Ctor. 
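+  // (Thread mapping used by the constructor below: within a warp the 8x4 quad layout places
+  // lane / 4 on the row and (lane % 4) * ELEMENTS_PER_STG on the column, so each thread
+  // stores 2 16-bit elements per STG.)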
+ template + inline __device__ Gmem_tile_o_hopper_16bits(Params const& params, Block_info const& block_info, + int tidx, int cta_row_offset = 0) + : params_o_stride_in_bytes_(params.o_stride_in_bytes), + actual_seqlen_(block_info.actual_seqlen), + o_ptr_(reinterpret_cast(params.o_ptr)) { + // Decompose the position of the thread into warp/lane. + int warp = tidx / Cta_tile::THREADS_PER_WARP; + int lane = tidx % Cta_tile::THREADS_PER_WARP; + + // int warpgroup_idx = warp / 4; + int warp_idx_within_warpgroup = warp % 4; + + // Compute the position in the sequence (within the CTA for the moment). + int row = warp_idx_within_warpgroup * (Mma_tile::M_PER_MMA / 4) + lane / 4; + // Store the row to update the predicates in load. + row_ = cta_row_offset + row; + // Compute the position of the thread in the row. + int col = lane % 4 * ELEMENTS_PER_STG; + + // The offset of the 1st row written by the thread. We store the P matrix interleaved. + int64_t row_offset = + (int64_t)row_ * params_o_stride_in_bytes_ + block_info.bidx * BYTES_PER_ROW; + // Finalize the pointer. + o_ptr_ += row_offset + col * BYTES_PER_ELEMENT; + } + + // Store data to memory. + template + inline __device__ void store(Accumulators const (&acc)[M][N]) { + int64_t const step_m = 8 * (this->params_o_stride_in_bytes_); + // we assume M = 1. some shortcuts. + static_assert(M == 1); +#pragma unroll + for (int row_idx = 0; row_idx < ROWS_PER_THREAD; ++row_idx) { + if (row_ + row_idx * 8 >= actual_seqlen_) { + break; + } +#pragma unroll + for (int mma_ni = 0; mma_ni < VALID_MMAS_N - 1; ++mma_ni) { +#pragma unroll + for (int col_idx = 0; col_idx < COLS_PER_THREAD; ++col_idx) { + uint32_t acc_0 = acc[0][mma_ni].reg(col_idx * ROWS_PER_THREAD + row_idx); + + int64_t offset = + (int64_t)row_idx * step_m + (int64_t)(col_idx + mma_ni * COLS_PER_THREAD) * STEP_N; + fmha::stg(o_ptr_ + offset, acc_0); + } // col_idx + } // mma_ni + + // The last mma_n may not store full elements back to GMEM. + int mma_ni = VALID_MMAS_N - 1; +#pragma unroll + for (int col_idx = 0; col_idx < VALID_COLS_PER_THREAD_FOR_LAST_MMA; ++col_idx) { + uint32_t acc_0 = acc[0][mma_ni].reg(col_idx * ROWS_PER_THREAD + row_idx); + + int64_t offset = + (int64_t)row_idx * step_m + (int64_t)(col_idx + mma_ni * COLS_PER_THREAD) * STEP_N; + fmha::stg(o_ptr_ + offset, acc_0); + } // col_idx + } // row_idx + } + + // Move to the next location. + inline __device__ void move() { + row_ += ROWS; + o_ptr_ += (int64_t)ROWS * params_o_stride_in_bytes_; + } + + // The stride between rows for the QKV matrice. + int64_t params_o_stride_in_bytes_; + // The pointer. + char* o_ptr_; + // Is the thread active for the last STG? + int is_active_for_last_stg_; + + // The row loaded by this thread. + int row_; + // The length of the sequence loaded by that CTA. 
+ int actual_seqlen_; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Gmem_tile_o_hopper< + fmha::Hopper_hgmma_fp16_traits, Cta_tile, + 1> // WARPS_K + : public Gmem_tile_o_hopper_16bits< + fmha::Hopper_hgmma_fp16_traits, Cta_tile> { + using Traits = fmha::Hopper_hgmma_fp16_traits; + + using Base = Gmem_tile_o_hopper_16bits< + fmha::Hopper_hgmma_fp16_traits, Cta_tile>; + + template + inline __device__ Gmem_tile_o_hopper(Params const& params, Block_info const& block_info, Shared&&, + int tidx, int cta_row_offset = 0) + : Base(params, block_info, tidx, cta_row_offset) { + static_assert(!std::is_same::value, "Check constructor argument type!"); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Gmem_tile_o_hopper< + fmha::Hopper_hgmma_fp32_traits, Cta_tile, + 1> // WARPS_K + : public Gmem_tile_o_hopper_16bits< + fmha::Hopper_hgmma_fp32_traits, Cta_tile> { + using Traits = fmha::Hopper_hgmma_fp32_traits; + + using Base = Gmem_tile_o_hopper_16bits< + fmha::Hopper_hgmma_fp32_traits, Cta_tile>; + + using Mma_tile = typename Base::Mma_tile; + + template + inline __device__ Gmem_tile_o_hopper(Params const& params, Block_info const& block_info, Shared&&, + int tidx, int cta_row_offset = 0) + : Base(params, block_info, tidx, cta_row_offset) { + static_assert(!std::is_same::value, "Check constructor argument type!"); + } + + // Store data to memory. + template + inline __device__ void store(Accumulators const (&acc)[M][N]) { + int64_t const step_m = 8 * (this->params_o_stride_in_bytes_); + // we assume M = 1. some shortcuts. + static_assert(M == 1); +#pragma unroll + for (int row_idx = 0; row_idx < Base::ROWS_PER_THREAD; ++row_idx) { + if (this->row_ + row_idx * 8 >= this->actual_seqlen_) { + break; + } +#pragma unroll + for (int mma_ni = 0; mma_ni < Base::VALID_MMAS_N - 1; ++mma_ni) { +#pragma unroll + for (int col_idx = 0; col_idx < Base::COLS_PER_THREAD; ++col_idx) { + // 2 denotes as fp32 --> fp16 + float reg0 = acc[0][mma_ni].elt(2 * (col_idx * Base::ROWS_PER_THREAD + row_idx)); + float reg1 = acc[0][mma_ni].elt(2 * (col_idx * Base::ROWS_PER_THREAD + row_idx) + 1); + uint32_t out = fmha::float2_to_half2(reg0, reg1); + + int64_t offset = (int64_t)row_idx * step_m + + (int64_t)(col_idx + mma_ni * Base::COLS_PER_THREAD) * Base::STEP_N; + fmha::stg(this->o_ptr_ + offset, out); + } // col_idx + } // mma_ni + + // The last mma_n may not store full elements back to GMEM. 
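+      // e.g. (sizes assumed for illustration): VALID_N == 40 with N_PER_MMA == 64 gives
+      // VALID_MMAS_N == 1 and VALID_COLS_PER_THREAD_FOR_LAST_MMA == 40 / 8 == 5, so only 5 of
+      // the 8 column pairs are written for this last MMA.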
+ int mma_ni = Base::VALID_MMAS_N - 1; +#pragma unroll + for (int col_idx = 0; col_idx < Base::VALID_COLS_PER_THREAD_FOR_LAST_MMA; ++col_idx) { + // 2 denotes as fp32 --> fp16 + float reg0 = acc[0][mma_ni].elt(2 * (col_idx * Base::ROWS_PER_THREAD + row_idx)); + float reg1 = acc[0][mma_ni].elt(2 * (col_idx * Base::ROWS_PER_THREAD + row_idx) + 1); + uint32_t out = fmha::float2_to_half2(reg0, reg1); + + int64_t offset = (int64_t)row_idx * step_m + + (int64_t)(col_idx + mma_ni * Base::COLS_PER_THREAD) * Base::STEP_N; + fmha::stg(this->o_ptr_ + offset, out); + } // col_idx + } // row_idx + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Gmem_tile_o_hopper< + fmha::Hopper_hgmma_bf16_traits, Cta_tile, + 1> // WARPS_K + : public Gmem_tile_o_hopper_16bits< + fmha::Hopper_hgmma_bf16_traits, Cta_tile> { + using Traits = fmha::Hopper_hgmma_bf16_traits; + + using Base = Gmem_tile_o_hopper_16bits< + fmha::Hopper_hgmma_bf16_traits, Cta_tile>; + + using Mma_tile = typename Base::Mma_tile; + + template + inline __device__ Gmem_tile_o_hopper(Params const& params, Block_info const& block_info, Shared&&, + int tidx, int cta_row_offset = 0) + : Base(params, block_info, tidx, cta_row_offset) { + static_assert(!std::is_same::value, "Check constructor argument type!"); + } + + // Store data to memory. + template + inline __device__ void store(Accumulators const (&acc)[M][N]) { + int64_t const step_m = 8 * (this->params_o_stride_in_bytes_); + // we assume M = 1. some shortcuts. + static_assert(M == 1); +#pragma unroll + for (int row_idx = 0; row_idx < Base::ROWS_PER_THREAD; ++row_idx) { + if (this->row_ + row_idx * 8 >= this->actual_seqlen_) { + break; + } +#pragma unroll + for (int mma_ni = 0; mma_ni < Mma_tile::VALID_MMAS_N - 1; ++mma_ni) { +#pragma unroll + for (int col_idx = 0; col_idx < Base::COLS_PER_THREAD; ++col_idx) { + // 2 denotes as fp32 --> bf16 + float reg0 = acc[0][mma_ni].elt(2 * (col_idx * Base::ROWS_PER_THREAD + row_idx)); + float reg1 = acc[0][mma_ni].elt(2 * (col_idx * Base::ROWS_PER_THREAD + row_idx) + 1); + uint32_t out = fmha::float2_to_bf16_x2(reg0, reg1); + + int64_t offset = (int64_t)row_idx * step_m + + (int64_t)(col_idx + mma_ni * Base::COLS_PER_THREAD) * Base::STEP_N; + fmha::stg(this->o_ptr_ + offset, out); + } // row_idx + } // col_idx + + // The last mma_n may not store full elements back to GMEM. 
+ int mma_ni = Base::VALID_MMAS_N - 1; +#pragma unroll + for (int col_idx = 0; col_idx < Base::VALID_COLS_PER_THREAD_FOR_LAST_MMA; ++col_idx) { + // 2 denotes as fp32 --> bf16 + float reg0 = acc[0][mma_ni].elt(2 * (col_idx * Base::ROWS_PER_THREAD + row_idx)); + float reg1 = acc[0][mma_ni].elt(2 * (col_idx * Base::ROWS_PER_THREAD + row_idx) + 1); + uint32_t out = fmha::float2_to_bf16_x2(reg0, reg1); + + int64_t offset = (int64_t)row_idx * step_m + + (int64_t)(col_idx + mma_ni * Base::COLS_PER_THREAD) * Base::STEP_N; + fmha::stg(this->o_ptr_ + offset, out); + } // row_idx + } // mma_ni + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Gmem_tile_o_hopper< + fmha::Hopper_hgmma_fp16_traits, Cta_tile, + 2> // WARPS_K + : public fmha::v2::Hmma_gmem_tile_o< + fmha::Hopper_hgmma_fp16_traits, Cta_tile, + /*CTAS_PER_HEAD=*/1, + /*BYTES_PER_STG=*/16> { + using Traits = fmha::Hopper_hgmma_fp16_traits; + using Base = fmha::v2::Hmma_gmem_tile_o; + + template + inline __device__ Gmem_tile_o_hopper(Params const& params, Block_info const& block_info, Shared&&, + int tidx, int cta_row_offset = 0) + : Base(params, block_info, tidx, cta_row_offset) { + static_assert(!std::is_same::value, "Check constructor argument type!"); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Gmem_tile_o_hopper< + fmha::Hopper_hgmma_fp32_traits, Cta_tile, + 2> // WARPS_K + : public fmha::v2::Hmma_gmem_tile_o< + fmha::Hopper_hgmma_fp32_traits, Cta_tile, + /*CTAS_PER_HEAD=*/1, + /*BYTES_PER_STG=*/16> { + using Traits = fmha::Hopper_hgmma_fp32_traits; + using Base = fmha::v2::Hmma_gmem_tile_o; + + template + inline __device__ Gmem_tile_o_hopper(Params const& params, Block_info const& block_info, Shared&&, + int tidx, int cta_row_offset = 0) + : Base(params, block_info, tidx, cta_row_offset) { + static_assert(!std::is_same::value, "Check constructor argument type!"); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Gmem_tile_o_hopper< + fmha::Hopper_hgmma_bf16_traits, Cta_tile, + 2> // WARPS_K + : public fmha::v2::Hmma_gmem_tile_o< + fmha::Hopper_hgmma_bf16_traits, Cta_tile, + /*CTAS_PER_HEAD=*/1, + /*BYTES_PER_STG=*/16> { + using Traits = fmha::Hopper_hgmma_bf16_traits; + using Base = fmha::v2::Hmma_gmem_tile_o; + + template + inline __device__ Gmem_tile_o_hopper(Params const& params, Block_info const& block_info, Shared&&, + int tidx, int cta_row_offset = 0) + : Base(params, block_info, tidx, cta_row_offset) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Gmem_tile_o, + Cta_tile, CTAS_PER_HEAD> + : public Gmem_tile_o_hopper< + fmha::Hopper_hgmma_fp16_traits, Cta_tile, + Cta_tile::WARPS_K> { + // The traits class. + using Traits = fmha::Hopper_hgmma_fp16_traits; + + using Base = Gmem_tile_o_hopper< + fmha::Hopper_hgmma_fp16_traits, Cta_tile, + Cta_tile::WARPS_K>; + + // Ctor. 
+ template + inline __device__ Gmem_tile_o(Params const& params, Block_info const& block_info, int tidx, + int cta_row_offset = 0) + : Base(params, block_info, std::nullptr_t{} /* dummy obj */, tidx, cta_row_offset) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Gmem_tile_o, + Cta_tile, CTAS_PER_HEAD> + : public Gmem_tile_o_hopper< + fmha::Hopper_hgmma_fp32_traits, Cta_tile, + Cta_tile::WARPS_K> { + // The traits class. + using Traits = fmha::Hopper_hgmma_fp32_traits; + + using Base = Gmem_tile_o_hopper< + fmha::Hopper_hgmma_fp32_traits, Cta_tile, + Cta_tile::WARPS_K>; + + // Ctor. + template + inline __device__ Gmem_tile_o(Params const& params, Block_info const& block_info, int tidx, + int cta_row_offset = 0) + : Base(params, block_info, std::nullptr_t{} /* dummy obj */, tidx, cta_row_offset) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Gmem_tile_o, + Cta_tile, CTAS_PER_HEAD> + : public Gmem_tile_o_hopper< + fmha::Hopper_hgmma_bf16_traits, Cta_tile, + Cta_tile::WARPS_K> { + // The traits class. + using Traits = fmha::Hopper_hgmma_bf16_traits; + + using Base = Gmem_tile_o_hopper< + fmha::Hopper_hgmma_bf16_traits, Cta_tile, + Cta_tile::WARPS_K>; + + // Ctor. + template + inline __device__ Gmem_tile_o(Params const& params, Block_info const& block_info, int tidx, + int cta_row_offset = 0) + : Base(params, block_info, std::nullptr_t{} /* dummy obj */, tidx, cta_row_offset) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Gmem_tile_o_gmma_32bit_8bit { + static_assert(sizeof(typename Traits::Accumulator_type) == 4); + static_assert(sizeof(typename Traits::C_type) == 1); + // This is for non-splitk GMMA BMM2. + static_assert(Cta_tile::WARPS_K == 1); + // The associated MMA tile. + using Mma_tile = typename Traits::template Mma_tile; + + // The number of elements per STG. + enum { ELEMENTS_PER_STG = 4 }; + + // The size in bytes of each element. + enum { BYTES_PER_ELEMENT = 1 }; + + // The size of each STG. + enum { BYTES_PER_STG = ELEMENTS_PER_STG * BYTES_PER_ELEMENT }; + + // The size of a row in bytes. + enum { BYTES_PER_ROW = Cta_tile::VALID_N * BYTES_PER_ELEMENT }; + + enum { ROWS = Cta_tile::M }; + + // The number of rows accessed by each thread. + enum { ROWS_PER_THREAD = Mma_tile::M_PER_MMA / 8 / Cta_tile::WARPS_M }; + + static_assert(ROWS_PER_THREAD == 2); + static_assert(ROWS_PER_THREAD == Mma_tile::ROWS_PER_THREAD); + + // The number of columns access by each thread. + // The number of core matrices in N. + enum { COLS_PER_THREAD = Mma_tile::N_PER_MMA / 4 / 2 }; // N_PER_MMA = GMMA_N + + static_assert(COLS_PER_THREAD == Mma_tile::COLS_PER_THREAD / 2); + // Assume there is an even number of core matrices, such that we can pack two + static_assert(COLS_PER_THREAD % 2 == 0); + + // Number of valid N columns. + enum { VALID_N = Cta_tile::VALID_N }; + + // The number of valid columns (stored to GMEM) by each thread. + enum { + VALID_COLS_PER_THREAD_FOR_LAST_MMA = + (VALID_N % Mma_tile::N_PER_MMA) == 0 ? COLS_PER_THREAD : (VALID_N % Mma_tile::N_PER_MMA) / 8 + }; + + enum { VALID_MMAS_N = fmha::Div_up::VALUE }; + + static_assert(VALID_N % 8 == 0, "The valid head dimension needs to be multiple of 8."); + + // The number of N elements must be multiple of 16 in order to pack 4 elements as uint32_t. 
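+  // e.g. (head sizes assumed for illustration): VALID_N == 80 satisfies 80 % 16 == 0 and takes
+  // the packed uint32_t STG path in store(), while VALID_N == 72 does not and falls back to
+  // the 2-element (uint16_t) stores at the end of store().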
+ enum { PACK_4_ELTS = VALID_N % 16 == 0 }; + + // The number of accumulator held by each thread, per HGMMA instruction. + enum { ELTS_PER_THREAD = ROWS_PER_THREAD * COLS_PER_THREAD * 2 }; + + // Currently, we assume for o matrix, GMMA M shape matches CTA M shape. + static_assert(Mma_tile::M_PER_MMA == Cta_tile::M && + Mma_tile::N_PER_MMA * Mma_tile::MMAS_N == Cta_tile::N, + "Currently, we assume for o matrix, GMMA M/N shape matches CTA M/N shape. "); + + // Step N for one quad (pack 4 elements for a thread, so 16 elements for a quad) + enum { STEP_N = 16 * BYTES_PER_ELEMENT }; + + // The number of head_dimension groups. + enum { N_GROUPS = fmha::Div_up::VALUE }; + + // The head_dimension per group. + enum { N_PER_GROUP = Cta_tile::N / N_GROUPS }; + + static_assert(N_GROUPS * N_PER_GROUP == Cta_tile::N); + + // The head_dimension bytes per group + enum { N_BYTES_PER_GROUP = Cta_tile::N * BYTES_PER_ELEMENT / N_GROUPS }; + + // Pack 2x4 core matrices, use STSMx4 + enum { STSM_PER_MMA = COLS_PER_THREAD / 4 }; + + // The number of registers per 16x16 block + enum { REGS_PER_QUAD = 8 }; + + // Bytes per bank + enum { BYTES_PER_BANK = 16 }; + + // The number of banks in N per group + enum { N_BANKS_PER_GROUP = N_BYTES_PER_GROUP / BYTES_PER_BANK }; + + enum { USE_TMA_STORE = USE_TMA_STORE_ }; + + // Ctor. + template + inline __device__ Gmem_tile_o_gmma_32bit_8bit(Params const& params, Block_info const& block_info, + Shared& shared, int tidx, int cta_row_offset = 0) + : Gmem_tile_o_gmma_32bit_8bit( + params.o_ptr, params.o_stride_in_bytes, block_info, tidx, +#ifdef GENERATE_CUBIN + // Specialized for trt-llm generated cubins only. + params.scale_bmm2_d ? *params.scale_bmm2_d : params.scale_bmm2, +#else + params.scale_bmm2, +#endif + cta_row_offset, 0, + __nvvm_get_smem_pointer(reinterpret_cast( + &shared.smem_o[__shfl_sync(0xffffffff, threadIdx.x / 128, 0)][0])), + ¶ms.tma_desc_o, params.h) { + } + + template + inline __device__ Gmem_tile_o_gmma_32bit_8bit(void* o_ptr, int o_stride_in_bytes, + Block_info const& block_info, int tidx, + uint32_t scale_bmm2, int cta_row_offset = 0, + int mat_offset = 0, uint32_t smem_base = 0, + cudaTmaDesc const* desc_o = nullptr, + int head_num = 0) + : params_o_stride_in_bytes_(o_stride_in_bytes), + actual_seqlen_(block_info.actual_seqlen), + o_ptr_(reinterpret_cast(o_ptr)), + params_scale_bmm2_(scale_bmm2), + smem_base_(smem_base), + desc_o_(desc_o) { + // Decompose the position of the thread into warp/lane. + int warp = tidx / Cta_tile::THREADS_PER_WARP; + int lane = tidx % Cta_tile::THREADS_PER_WARP; + + // int warpgroup_idx = warp / 4; + int warp_idx_within_warpgroup = warp % 4; + + if (USE_TMA_STORE) { + // The head index + bidh_ = block_info.bidh; + // The lane id + lane_ = lane; + // The start row index for current batch + int row_curr_batch = (block_info.bidx - block_info.bidh) / head_num; + // The row index offset of current warp + int row_offset_warp = cta_row_offset + warp_idx_within_warpgroup * (Mma_tile::M_PER_MMA / 4); + // The row index for the current warp + row_tma_ = row_offset_warp + row_curr_batch; + // The valid rows for the current warp. Each warp writes from 0 to 16 rows + num_valid_rows_ = min(Mma_tile::M_PER_MMA / 4, actual_seqlen_ - row_offset_warp); + num_valid_rows_ = max(num_valid_rows_, 0); + // WARNING: Without this line, the predicate will not behavior as expected for unknown reason. 
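+      // (One plausible explanation, not verified here: broadcasting from lane 0 makes
+      // num_valid_rows_ warp-uniform, which lets the compiler keep the TMA-store predicate in a
+      // uniform register.)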
+ num_valid_rows_ = __shfl_sync(0xffffffff, num_valid_rows_, 0); + // Compute the smem base for STSM + smem_base_ += + warp_idx_within_warpgroup * (Mma_tile::M_PER_MMA / 4) * Cta_tile::N * BYTES_PER_ELEMENT + + (warp / 4) * Mma_tile::M_PER_MMA * Cta_tile::N * BYTES_PER_ELEMENT; + // Compute gmem base for STG in tail case + o_ptr_ += row_tma_ * params_o_stride_in_bytes_ + bidh_ * BYTES_PER_ROW; + } else { + // Compute the position in the sequence (within the CTA for the moment). + int row = warp_idx_within_warpgroup * (Mma_tile::M_PER_MMA / 4) + lane / 4; + // Store the row to update the predicates in load. + row_ = cta_row_offset + row; + // Compute the position of the thread in the row. + col_ = lane % 4 * ELEMENTS_PER_STG; + + // The offset of the 1st row written by the thread. We store the P matrix interleaved. + int64_t row_offset = + (int64_t)row_ * params_o_stride_in_bytes_ + block_info.bidx * BYTES_PER_ROW; + // Finalize the pointer. + o_ptr_ += row_offset + col_ * BYTES_PER_ELEMENT; + } + + // REVIEW: need heads_interleaved option for non-warp-specialized QGMMA + LDGSTS kernels. + // // The row offset in the batched GEMM. For each seq element, we store QKV in that order. + // int64_t row_offset = (int64_t) row_ * params_o_stride_in_bytes_; + // // Add the block index. + // int64_t idx = block_info.bidx; + // if(NUM_MATS > 1) { + // if( HEADS_INTERLEAVED ) { + // idx = block_info.bidx * NUM_MATS + mat_offset; + // } else { + // idx = (block_info.sum_s * NUM_MATS + mat_offset) * block_info.num_heads + + // block_info.bidh; + // } + // } + // // Assemble the final pointer. + // o_ptr_ += row_offset + idx * BYTES_PER_ROW + col * BYTES_PER_ELEMENT; + } + + // Store data to memory. + template + inline __device__ void store(Accumulators const (&acc)[M][N]) { + static_assert(Accumulators::NUM_ELTS == ELTS_PER_THREAD); + static_assert(COLS_PER_THREAD / 2 * ROWS_PER_THREAD * 4 == ELTS_PER_THREAD); + + // we assume M = N = 1. some shortcuts. + static_assert(M == 1); + + if (USE_TMA_STORE) { + static_assert(COLS_PER_THREAD % 4 == 0); + static_assert(ROWS_PER_THREAD == 2); + + int const swizzled_row = (lane_ % 16); + int const swizzled_col = (lane_ / 16); + constexpr int max_swizzle_id = N_BYTES_PER_GROUP / 16; + constexpr int swizzle_row_divider = 128 / N_BYTES_PER_GROUP; + + uint32_t stsm_addr[VALID_MMAS_N][STSM_PER_MMA]; +// Compute swizzled smem address +#pragma unroll + for (int mma_ni = 0; mma_ni < VALID_MMAS_N; ++mma_ni) { +#pragma unroll + for (int ci = 0; ci < STSM_PER_MMA; ++ci) { + int const col_bank = ((mma_ni)*STSM_PER_MMA + ci) * 2 + swizzled_col; + int const di = col_bank / N_BANKS_PER_GROUP; // which N group it belongs to + stsm_addr[mma_ni][ci] = smem_base_ + di * 16 * N_BYTES_PER_GROUP + // group dimension + (((swizzled_row / swizzle_row_divider) % max_swizzle_id) ^ + (col_bank % N_BANKS_PER_GROUP)) * + BYTES_PER_BANK + // column dimension + swizzled_row * N_BYTES_PER_GROUP; // row dimension + } + } + +#pragma unroll + for (int mma_ni = 0; mma_ni < VALID_MMAS_N; ++mma_ni) { +#pragma unroll + for (int ci = 0; ci < STSM_PER_MMA; ++ci) { + uint32_t dst[4]; + uint4 src[4]; + + /* + * Each STSMx4 produces a 16x32 block, that is 2x4 core matrices + * ----------------- + * | 0 | 2 | 4 | 6 | + * ----------------- + * | 1 | 3 | 5 | 7 | + * ----------------- + * + * Consider the entire warp, src[0] holds matrices 0,2; src[1] holds matrices 1,3; + * src[3] holds matrices 4,6; src[4] holds matrices 5,7. 
+ */ + src[0].x = acc[0][mma_ni].reg((ci * 2 + 0) * REGS_PER_QUAD + 0); + src[0].y = acc[0][mma_ni].reg((ci * 2 + 0) * REGS_PER_QUAD + 4); + src[0].z = acc[0][mma_ni].reg((ci * 2 + 0) * REGS_PER_QUAD + 1); + src[0].w = acc[0][mma_ni].reg((ci * 2 + 0) * REGS_PER_QUAD + 5); + + src[1].x = acc[0][mma_ni].reg((ci * 2 + 0) * REGS_PER_QUAD + 2); + src[1].y = acc[0][mma_ni].reg((ci * 2 + 0) * REGS_PER_QUAD + 6); + src[1].z = acc[0][mma_ni].reg((ci * 2 + 0) * REGS_PER_QUAD + 3); + src[1].w = acc[0][mma_ni].reg((ci * 2 + 0) * REGS_PER_QUAD + 7); + + src[2].x = acc[0][mma_ni].reg((ci * 2 + 1) * REGS_PER_QUAD + 0); + src[2].y = acc[0][mma_ni].reg((ci * 2 + 1) * REGS_PER_QUAD + 4); + src[2].z = acc[0][mma_ni].reg((ci * 2 + 1) * REGS_PER_QUAD + 1); + src[2].w = acc[0][mma_ni].reg((ci * 2 + 1) * REGS_PER_QUAD + 5); + + src[3].x = acc[0][mma_ni].reg((ci * 2 + 1) * REGS_PER_QUAD + 2); + src[3].y = acc[0][mma_ni].reg((ci * 2 + 1) * REGS_PER_QUAD + 6); + src[3].z = acc[0][mma_ni].reg((ci * 2 + 1) * REGS_PER_QUAD + 3); + src[3].w = acc[0][mma_ni].reg((ci * 2 + 1) * REGS_PER_QUAD + 7); + + using Src_type = typename Traits::Accumulator_type; + using Dst_type = typename Traits::C_type; +// Packs the 32bit values to 8bit. +// Depending on the type, applies extra scaling with parameter scale_bmm2. +#pragma unroll + for (int i = 0; i < 4; ++i) { +#ifdef UNIFIED_EPILOGUE_SCALE + dst[i] = Acc_packer::run(this, src[i]); +#else + dst[i] = Acc_packer::run(this, src[i]); +#endif + } + stsm(stsm_addr[mma_ni][ci], *reinterpret_cast(&dst[0])); + } + } + + // TODO: Interleave STSM and UTMASTG of two N groups + constexpr int MAX_ROWS_PER_WARP = Mma_tile::M_PER_MMA / 4; + if (num_valid_rows_ == MAX_ROWS_PER_WARP) { + fence_view_async_shared(); +#pragma unroll + for (int di = 0; di < N_GROUPS; ++di) { + const int32_t coords[3] = {di * N_PER_GROUP, bidh_, row_tma_}; + fmha::utmastg<3, fmha::cudaTmaDescType::TILED>( + desc_o_, smem_base_ + di * 16 * N_BYTES_PER_GROUP, coords); + } + tmastg_arrive(); + tmastg_wait(); + } else if (num_valid_rows_ > 0) { + // Use LDS.64 + STG.64 to store num_valid_rows_ x N tile + constexpr int BYTES_PER_THREAD = 8; + static_assert((VALID_N % BYTES_PER_THREAD) == 0, "VALID_N must be divided by 8 for STG.64"); + // Number of valid rows + int row_size = num_valid_rows_; + // Number of threads per row. Each thread read/write 8B (8 elements). 
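+      // e.g. (group size assumed for illustration): N_BYTES_PER_GROUP == 128 gives
+      // THREADS_PER_ROW == 16, so a 32-thread warp covers ROWS_PER_WARP == 2 rows per
+      // iteration of the loop below.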
+ constexpr int THREADS_PER_ROW = N_BYTES_PER_GROUP / 8; + // Number of rows read/written by a warp + static_assert(Cta_tile::THREADS_PER_WARP % THREADS_PER_ROW == 0, + "A warp must reads full rows"); + constexpr int ROWS_PER_WARP = Cta_tile::THREADS_PER_WARP / THREADS_PER_ROW; + // GMEM stride in M dimension + int64_t const step_m = (this->params_o_stride_in_bytes_); + // Initial column index + int const ci = lane_ % THREADS_PER_ROW; + int const bank_idx = (ci * BYTES_PER_THREAD) / BYTES_PER_BANK; + int const bank_offset = (ci * BYTES_PER_THREAD) % BYTES_PER_BANK; + +#pragma unroll + for (int di = 0; di < N_GROUPS; ++di) { + // Detect GMEM index out of bound + if ((di * N_BYTES_PER_GROUP + ci * BYTES_PER_THREAD) >= BYTES_PER_ROW) { + break; + } +#pragma unroll + for (int ri = lane_ / THREADS_PER_ROW; ri < row_size; ri += ROWS_PER_WARP) { + // Create the swizzled offset + uint32_t smem_offset = + di * 16 * N_BYTES_PER_GROUP + ri * N_BYTES_PER_GROUP + + (((ri / swizzle_row_divider) % max_swizzle_id) ^ bank_idx) * BYTES_PER_BANK + + bank_offset; + uint2 buffer; + fmha::lds(buffer, smem_base_ + smem_offset); + int64_t gmem_offset = + (int64_t)ri * step_m + di * N_BYTES_PER_GROUP + ci * BYTES_PER_THREAD; + fmha::stg(o_ptr_ + gmem_offset, buffer); + } + } + } + } else { + int64_t const step_m = 8 * (this->params_o_stride_in_bytes_); + +#pragma unroll + for (int ri = 0; ri < ROWS_PER_THREAD; ++ri) { + if (row_ + ri * 8 >= actual_seqlen_) { + break; + } + +#pragma unroll + for (int mma_ni = 0; mma_ni < VALID_MMAS_N - 1; ++mma_ni) { +// Iterate over 16 columns to pack 4 values per thread. +#pragma unroll + for (int ci = 0; ci < COLS_PER_THREAD / 2; ++ci) { + // Assuming EVEN,EVEN,ODD,ODD column pattern due to packing of V. + uint4 src; + src.x = acc[0][mma_ni].reg(((2 * ci + 0) * ROWS_PER_THREAD + ri) * 2 + 0); // 0 + src.y = acc[0][mma_ni].reg(((2 * ci + 1) * ROWS_PER_THREAD + ri) * 2 + 0); // 4 + src.z = acc[0][mma_ni].reg(((2 * ci + 0) * ROWS_PER_THREAD + ri) * 2 + 1); // 1 + src.w = acc[0][mma_ni].reg(((2 * ci + 1) * ROWS_PER_THREAD + ri) * 2 + 1); // 5 + + using Src_type = typename Traits::Accumulator_type; + using Dst_type = typename Traits::C_type; + // Packs the 32bit values to 8bit. + // Depending on the type, applies extra scaling with parameter scale_bmm2. +#ifdef UNIFIED_EPILOGUE_SCALE + uint32_t dst = Acc_packer::run(this, src); +#else + uint32_t dst = Acc_packer::run(this, src); +#endif + + int64_t offset = + (int64_t)ri * step_m + (int64_t)(ci + mma_ni * COLS_PER_THREAD / 2) * STEP_N; + fmha::stg(o_ptr_ + offset, dst); + } // ci + } // mma_ni + + if constexpr (PACK_4_ELTS) { + // The last mma_n may not store full elements back to GMEM. + int mma_ni = VALID_MMAS_N - 1; +// Iterate over 16 columns to pack 4 values per thread. +#pragma unroll + for (int ci = 0; ci < VALID_COLS_PER_THREAD_FOR_LAST_MMA / 2; ++ci) { + // Assuming EVEN,EVEN,ODD,ODD column pattern due to packing of V. + uint4 src; + src.x = acc[0][mma_ni].reg(((2 * ci + 0) * ROWS_PER_THREAD + ri) * 2 + 0); // 0 + src.y = acc[0][mma_ni].reg(((2 * ci + 1) * ROWS_PER_THREAD + ri) * 2 + 0); // 4 + src.z = acc[0][mma_ni].reg(((2 * ci + 0) * ROWS_PER_THREAD + ri) * 2 + 1); // 1 + src.w = acc[0][mma_ni].reg(((2 * ci + 1) * ROWS_PER_THREAD + ri) * 2 + 1); // 5 + + using Src_type = typename Traits::Accumulator_type; + using Dst_type = typename Traits::C_type; + // Packs the 32bit values to 8bit. + // Depending on the type, applies extra scaling with parameter scale_bmm2. 
+#ifdef UNIFIED_EPILOGUE_SCALE + uint32_t dst = Acc_packer::run(this, src); +#else + uint32_t dst = Acc_packer::run(this, src); +#endif + + int64_t offset = + (int64_t)ri * step_m + (int64_t)(ci + mma_ni * COLS_PER_THREAD / 2) * STEP_N; + fmha::stg(o_ptr_ + offset, dst); + } // ci + } else { + // The last mma_n may not store full elements back to GMEM. + int mma_ni = VALID_MMAS_N - 1; +// Iterate over 16 columns to pack 4 values per thread (2 uint2). +#pragma unroll + for (int ci = 0; ci < fmha::Div_up::VALUE; ++ci) { + // Assuming EVEN,EVEN,ODD,ODD column pattern due to packing of V. + uint2 src0, src1; + src0.x = acc[0][mma_ni].reg(((2 * ci + 0) * ROWS_PER_THREAD + ri) * 2 + 0); // 0 + src0.y = acc[0][mma_ni].reg(((2 * ci + 1) * ROWS_PER_THREAD + ri) * 2 + 0); // 4 + src1.x = acc[0][mma_ni].reg(((2 * ci + 0) * ROWS_PER_THREAD + ri) * 2 + 1); // 1 + src1.y = acc[0][mma_ni].reg(((2 * ci + 1) * ROWS_PER_THREAD + ri) * 2 + 1); // 5 + + using Src_type = typename Traits::Accumulator_type; + using Dst_type = typename Traits::C_type; +#ifdef UNIFIED_EPILOGUE_SCALE + uint16_t dst0 = Acc_packer::run(this, src0); + uint16_t dst1 = Acc_packer::run(this, src1); +#else + uint16_t dst0 = Acc_packer::run(this, src0); + uint16_t dst1 = Acc_packer::run(this, src1); +#endif + + // 4 elements per thread, so 16 elements per loop. + int col_idx = (ci + mma_ni * COLS_PER_THREAD / 2) * 16; + + int64_t offset = (int64_t)ri * step_m + (int64_t)(col_idx)*BYTES_PER_ELEMENT; + + if (col_idx + col_ < VALID_N) { + fmha::stg(o_ptr_ + offset, dst0); + } + + if (col_idx + col_ + 2 < VALID_N) { + fmha::stg(o_ptr_ + offset + 2 * BYTES_PER_ELEMENT, dst1); + } + } // ci + } + } // ri + } + } + + // Move to the next location. + inline __device__ void move() { + row_ += ROWS; + o_ptr_ += (int64_t)ROWS * params_o_stride_in_bytes_; + } + + // The stride between rows for the QKV matrice. + int64_t params_o_stride_in_bytes_; + // The pointer. + char* o_ptr_; + // Is the thread active for the last STG? + int is_active_for_last_stg_; + + // The row, col loaded by this thread. + int row_, col_; + // The length of the sequence loaded by that CTA. + int actual_seqlen_; + + // Scaling factor; this usually means QKV descale factor in actuality + uint32_t params_scale_bmm2_; + + // Smem buffer for TMASTG + uint32_t smem_base_; + cudaTmaDesc const* desc_o_; + + int lane_; + int row_tma_; + int num_valid_rows_; + int bidh_; + + bool const params_enable_i2f_trick_ = false; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Gmem_tile_o_hopper_32bit_8bit {}; + +template +struct Gmem_tile_o_hopper_32bit_8bit + : public Gmem_tile_o_gmma_32bit_8bit { + // The Base class. + using Base = Gmem_tile_o_gmma_32bit_8bit; + + // Ctor. + template + inline __device__ Gmem_tile_o_hopper_32bit_8bit(Params const& params, + Block_info const& block_info, Shared& shared, + int tidx, int cta_row_offset = 0) + : Base(params, block_info, shared, tidx, cta_row_offset) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Gmem_tile_o_hopper_32bit_8bit + : public Gmem_tile_o_8bit { + // The Base class. + using Base = Gmem_tile_o_8bit; + + // Ctor. 
+ template + inline __device__ Gmem_tile_o_hopper_32bit_8bit(Params const& params, + Block_info const& block_info, Shared& shared, + int tidx, int cta_row_offset = 0) + : Base(params, block_info, shared, tidx, cta_row_offset) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Gmem_tile_o_hopper< + fmha::Hopper_qgmma_fp8_fp32_traits, Cta_tile, + CTAS_PER_HEAD> + : public Gmem_tile_o_hopper_32bit_8bit< + fmha::Hopper_qgmma_fp8_fp32_traits, + Cta_tile, Cta_tile::WARPS_K> { + // The traits class. + using Traits = fmha::Hopper_qgmma_fp8_fp32_traits; + + using Base = Gmem_tile_o_hopper_32bit_8bit< + fmha::Hopper_qgmma_fp8_fp32_traits, Cta_tile, + Cta_tile::WARPS_K>; + + // Ctor. + template + inline __device__ Gmem_tile_o_hopper(Params const& params, Block_info const& block_info, + Shared& shared, int tidx, int cta_row_offset = 0) + : Base(params, block_info, shared, tidx, cta_row_offset) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Gmem_tile_o< + fmha::Hopper_igmma_int8_int32_traits, Cta_tile, + CTAS_PER_HEAD> + : public Gmem_tile_o_hopper_32bit_8bit< + fmha::Hopper_igmma_int8_int32_traits, + Cta_tile, Cta_tile::WARPS_K> { + // The traits class. + using Traits = fmha::Hopper_igmma_int8_int32_traits; + + using Base = Gmem_tile_o_hopper_32bit_8bit< + fmha::Hopper_igmma_int8_int32_traits, Cta_tile, + Cta_tile::WARPS_K>; + + // Ctor. + template + inline __device__ Gmem_tile_o(Params const& params, Block_info const& block_info, int tidx, + int cta_row_offset = 0) + : Base(params, block_info, std::nullptr_t{} /* dummy obj */, tidx, cta_row_offset) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Gmem_tile_o_qgmma_fp32_16bits { + // The associated MMA tile. + using Mma_tile = typename Traits::template Mma_tile; + + // The number of elements per STG. + enum { ELEMENTS_PER_STG = 2 }; + + // The size in bytes of each element. + enum { BYTES_PER_ELEMENT = 2 }; + + // The size of each STG. + enum { BYTES_PER_STG = ELEMENTS_PER_STG * BYTES_PER_ELEMENT }; + + // The size of a row in bytes. + enum { BYTES_PER_ROW = Cta_tile::VALID_N * BYTES_PER_ELEMENT }; + + // The number of rows accessed by each thread. + enum { ROWS_PER_THREAD = Mma_tile::M_PER_MMA / 8 / Cta_tile::WARPS_PER_CTA }; + + enum { ROWS = Cta_tile::M }; + + // The number of columns access by each thread. + // Note there are 2 elements per reg. + enum { COLS_PER_THREAD = Mma_tile::N_PER_MMA / 4 / 2 }; + + // The number of valid columns (stored to GMEM) by each thread. + enum { + VALID_COLS_PER_THREAD_FOR_LAST_MMA = (Cta_tile::VALID_N % Mma_tile::N_PER_MMA) == 0 + ? COLS_PER_THREAD + : (Cta_tile::VALID_N % Mma_tile::N_PER_MMA) / 8 + }; + + enum { VALID_MMAS_N = fmha::Div_up::VALUE }; + + static_assert(Cta_tile::VALID_N % 8 == 0, "The valid head dimension needs to be multiple of 8."); + + // The number of accumulator held by each thread, per HGMMA instruction. + enum { ELTS_PER_THREAD = ROWS_PER_THREAD * COLS_PER_THREAD }; + + // Currently, we assume for o matrix, GMMA M/N shape matches CTA M/N shape. + static_assert(Mma_tile::M_PER_MMA == Cta_tile::M && + Mma_tile::N_PER_MMA * Mma_tile::MMAS_N == Cta_tile::N, + "Currently, we assume for o matrix, GMMA M shape matches CTA M shape. "); + + // Step N for one quad + enum { STEP_N = 8 * BYTES_PER_ELEMENT }; + + // Ctor. 
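+ // Illustration of the thread mapping computed in the ctor below (assuming the usual GMMA
+ // M of 64, i.e. Mma_tile::M_PER_MMA / 4 == 16 rows per warp of the warpgroup): tidx = 70
+ // decomposes into warp 2, lane 6, so row = 2 * 16 + 6 / 4 = 33 and
+ // col = (6 % 4) * ELEMENTS_PER_STG * 2 = 8 elements from the start of the row.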
+ template + inline __device__ Gmem_tile_o_qgmma_fp32_16bits(Params const& params, + Block_info const& block_info, Shared&&, int tidx, + int cta_row_offset = 0) + : params_o_stride_in_bytes_(params.o_stride_in_bytes), + params_scale_bmm2_( +#ifdef GENERATE_CUBIN + // Specialized for trt-llm generated cubins only. + params.scale_bmm2_d ? *params.scale_bmm2_d : params.scale_bmm2 +#else + params.scale_bmm2 +#endif + ), + actual_seqlen_(block_info.actual_seqlen), + o_ptr_(reinterpret_cast(params.o_ptr)) { + static_assert(!std::is_same::value, "Check constructor argument type!"); + // Decompose the position of the thread into warp/lane. + int warp = tidx / Cta_tile::THREADS_PER_WARP; + int lane = tidx % Cta_tile::THREADS_PER_WARP; + + int warp_idx_within_warpgroup = warp % 4; + + // Compute the position in the sequence (within the CTA for the moment). + int row = warp_idx_within_warpgroup * (Mma_tile::M_PER_MMA / 4) + lane / 4; + // Store the row to update the predicates in load. + row_ = cta_row_offset + row; + // Compute the position of the thread in the row. + // echo loop handles 2 cores, so x2 (this is the difference to Gmem_tile_o_hopper_16bits) + int col = lane % 4 * ELEMENTS_PER_STG * 2; + + // The offset of the 1st row written by the thread. We store the P matrix interleaved. + int64_t row_offset = + (int64_t)row_ * params_o_stride_in_bytes_ + block_info.bidx * BYTES_PER_ROW; + // Finalize the pointer. + o_ptr_ += row_offset + col * BYTES_PER_ELEMENT; + } + + // Store data to memory. + template + inline __device__ void store(Accumulators const (&acc)[M][N]) { + int64_t const step_m = 8 * params_o_stride_in_bytes_; +#ifdef UNIFIED_EPILOGUE_SCALE + constexpr bool Scale = false; +#else + constexpr bool Scale = true; +#endif +#define STORE_COLUMNS() \ + { \ + /* we assume M = 1. some shortcuts. */ \ + static_assert(M == 1); \ + uint4 _src = { \ + .x = acc[0][mma_ni].reg(((ci + 0) * ROWS_PER_THREAD + ri) * 2), \ + .y = acc[0][mma_ni].reg(((ci + 1) * ROWS_PER_THREAD + ri) * 2), \ + .z = acc[0][mma_ni].reg(((ci + 0) * ROWS_PER_THREAD + ri) * 2 + 1), \ + .w = acc[0][mma_ni].reg(((ci + 1) * ROWS_PER_THREAD + ri) * 2 + 1), \ + }; \ + uint2 _dst = Acc_packer::run(this, _src); \ + int64_t _offset = (int64_t)ri * step_m + (int64_t)(ci + mma_ni * COLS_PER_THREAD) * STEP_N; \ + fmha::stg(o_ptr_ + _offset, _dst); \ + } + +#pragma unroll + for (int ri = 0; ri < ROWS_PER_THREAD; ri++) { + if (row_ + ri * 8 >= actual_seqlen_) { + break; + } +#pragma unroll + for (int mma_ni = 0; mma_ni < VALID_MMAS_N - 1; ++mma_ni) { +#pragma unroll + for (int ci = 0; ci < COLS_PER_THREAD; ci += 2) { + STORE_COLUMNS() + } + } + // The last mma_n may not store full elements back to GMEM. + int mma_ni = VALID_MMAS_N - 1; +#pragma unroll + for (int ci = 0; ci < VALID_COLS_PER_THREAD_FOR_LAST_MMA; ci += 2) { + STORE_COLUMNS() + } + } + } + + // Move to the next location. + inline __device__ void move() { + row_ += ROWS; + o_ptr_ += (int64_t)ROWS * params_o_stride_in_bytes_; + } + + // The stride between rows for the QKV matrice. + int64_t params_o_stride_in_bytes_; + // Scaling factor; this usually means QKV descale factor in actuality + uint32_t params_scale_bmm2_; + // The pointer. + char* o_ptr_; + // The row loaded by this thread. + int row_; + // The length of the sequence loaded by that CTA. 
+ int actual_seqlen_; +}; + +} // namespace v2 + +} // namespace fmha diff --git a/csrc/fmha_v2/fmha/hopper/gmem_tile_qkv_packed.h b/csrc/fmha_v2/fmha/hopper/gmem_tile_qkv_packed.h new file mode 100644 index 0000000000..5ee0ac50d1 --- /dev/null +++ b/csrc/fmha_v2/fmha/hopper/gmem_tile_qkv_packed.h @@ -0,0 +1,146 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2011-2024 NVIDIA CORPORATION & AFFILIATES. All rights + * reserved. SPDX-License-Identifier: NVIDIA TensorRT Source Code License Agreement + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. + */ + +#pragma once +#include + +namespace fmha { +namespace v2 { + +template < + // The instruction traits. + typename Traits, + // The dimensions of the tile computed by the CTA. + typename Cta_tile, + // The number of bits per element. + int BITS_PER_ELEMENT, + // The number of rows of Q, K or V loaded by this tile. + int ROWS_, + // The number of columns. + int COLS, + // Do we use LDGSTS? + bool USE_LDGSTS_, + // Are attention heads interleaved? + bool HEADS_INTERLEAVED, + // The number of matrices + int NUM_MATS = 3> +struct Gmem_tile_tma_qkv { + // The size of each LDG. + enum { BYTES_PER_LDG = 16 }; + + // The size of a row in bytes. + enum { BYTES_PER_ROW = COLS * BITS_PER_ELEMENT / 8 }; + + // The number of threads to load a "row" of the matrix. + enum { THREADS_PER_ROW = BYTES_PER_ROW / BYTES_PER_LDG }; + + // The number of "rows" loaded per LDG. + enum { ROWS_PER_LDG = Cta_tile::THREADS_PER_CTA / THREADS_PER_ROW }; + + // The number of rows. + enum { ROWS = ROWS_ }; + + // The number of LDGs needed to load a chunk of the Q matrix. + enum { LDGS = fmha::Div_up::VALUE }; + + // The number of predicate registers. + enum { PRED_REGS = fmha::Compute_number_of_pred_regs::VALUE }; + + // Is it Hopper? + enum { + IS_HOPPER = std::is_same::value == true + }; + + // Make sure we use a single register to store predicates. Do not throw for Hopper for now. + static_assert(!USE_LDGSTS_ || PRED_REGS == 1 || IS_HOPPER, ""); + + // We do not use LDGSTS (for the moment). + enum { USE_LDGSTS = USE_LDGSTS_ }; + + // TMA DIMS, hard coded for now + enum { TMA_DIMS = 3 }; + + // TMA DESC type, hard coded for now + static constexpr fmha::cudaTmaDescType TMA_DESC_TYPE = fmha::cudaTmaDescType::TILED; + + // Ctor. 
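+ // Worked example of the TMA box origin computed in the ctor below for the packed
+ // QKV layout [b, s, [q_hd, k_h'd, v_h'd]] (numbers are illustrative only):
+ // with h = 32, h_kv = 8, d = 128 and head bidh = 13, hi_kv = 13 / (32 / 8) = 3, so
+ //   Q: coord[0] = 13 * 128                     = 1664
+ //   K: coord[0] = 32 * 128 + 3 * 128           = 4480
+ //   V: coord[0] = 32 * 128 + 8 * 128 + 3 * 128 = 5504
+ // whereas the MHA branch uses qkv_offset * d + bidh * 3 * d instead.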
+ template + inline __device__ Gmem_tile_tma_qkv(Params const& params, cudaTmaDesc const* p_desc, + int qkv_offset, Block_info const& block_info, int tidx, + int cta_row_offset = 0) + // in PACKED_QKV, q_stride = k_stride = v_stride + : params_qkv_stride_in_bytes_(params.q_stride_in_bytes), + actual_seqlen_(block_info.actual_seqlen), + qkv_ptr_(reinterpret_cast(params.qkv_ptr)), + p_desc_(p_desc) { + // Both MQA and GQA will use non HEADS_INTERLEAVED layout + if (params.h_kv < params.h) { + // QKV layout [b, s, [q_hd, k_h'd, v_h'd]] + int const hi = block_info.bidh; + int const hi_kv = block_info.bidh / (params.h / params.h_kv); + if (qkv_offset == 0) { // Q tensor + coord[0] = hi * params.d; + } else if (qkv_offset == 1) { // K tensor + coord[0] = params.h * params.d + hi_kv * params.d; + } else if (qkv_offset == 2) { // V tensor + coord[0] = params.h * params.d + params.h_kv * params.d + hi_kv * params.d; + } + } else { + coord[0] = qkv_offset * params.d + block_info.bidh * params.d * 3; + } + // coord[1] = block_info.bidb * params.s; // should be params.s * batch_idx + // coord[1] do not need to be adjusted per batch. + // since the gmem_ptr in tma desc is set per batch and already adjusted. + coord[1] = block_info.sum_s; + coord[2] = 0; + } + + // Store data to shared memory. + template + inline __device__ void commit(Smem_tile& smem_tile) {} + + // Load data from memory. + template + inline __device__ void load(Smem_tile& smem_tile) { + smem_tile.template store(p_desc_, coord); + } + + // Store data to memory. + inline __device__ void store(uint4 const (&data)[LDGS]) {} + + // Move the pointer to the next location. + // only needed by matrix Q. + inline __device__ void move() { + // coord[1] is incremented by STEP size. + coord[1] += ROWS; + } + + // The stride between rows for the QKV matrice. + int64_t params_qkv_stride_in_bytes_; + // The pointer. + char* qkv_ptr_; + // The register to store predicates. + uint32_t preds_[PRED_REGS]; + // The fetch registers. + uint4 fetch_[LDGS]; + // Keep track of the row the thread is processing as we move the tile. + int row_; + // The sequence length. + int actual_seqlen_; + // tma descriptor + cudaTmaDesc const* p_desc_; + // coord use by TMA. For now hard code to 3D. + int32_t coord[3]; +}; + +} // namespace v2 +} // namespace fmha diff --git a/csrc/fmha_v2/fmha/hopper/gmma_descriptor.h b/csrc/fmha_v2/fmha/hopper/gmma_descriptor.h new file mode 100644 index 0000000000..8b4129e343 --- /dev/null +++ b/csrc/fmha_v2/fmha/hopper/gmma_descriptor.h @@ -0,0 +1,547 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2011-2024 NVIDIA CORPORATION & AFFILIATES. All rights + * reserved. SPDX-License-Identifier: NVIDIA TensorRT Source Code License Agreement + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. + */ + +#pragma once + +namespace fmha { +//////////////////////////////////////////////////////////////////////////////////////////////////// +// whether transpose is applied on the smem before GMMA math execution +// if TN, notrans is applied to both A and B. as GMMA expects the data +// to be in TN format. +// if NT, trans is applied to both A and B. 
+//////////////////////////////////////////////////////////////////////////////////////////////////// +enum class Gmma_descriptor_transpose { TRANS, NOTRANS }; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// Gmma descriptor mode +// 2 bits to specify the descriptor mode. +//////////////////////////////////////////////////////////////////////////////////////////////////// +enum class Gmma_descriptor_mode { SWIZZLE_NONE = 0, SWIZZLE_128B, SWIZZLE_64B, SWIZZLE_32B }; +constexpr uint32_t GMMA_DESCRIPTOR_MODE_BITS = 2; +constexpr uint32_t GMMA_DESCRIPTOR_MODE_SHIFT = 62; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// number of descriptor per GMMA group to be actually allocated per kblock +//////////////////////////////////////////////////////////////////////////////////////////////////// +enum class Gmma_descriptor_size { + ONE, + TWO, // not yet implemented. might be needed for 64xNxK tile size. + // as many as needed (kblock / gmma_k). we may not prefer to use this as we may run out of UR + // budget + ALL +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// a single desc that has the info and bits +//////////////////////////////////////////////////////////////////////////////////////////////////// +template +class Single_descriptor { + public: + // trans mode + static constexpr Gmma_descriptor_transpose TRANS_MODE = Gmma_trans; + + // set the single desc + inline __device__ void set(uint64_t const& desc_) { desc = desc_; } + + // get the single desc + inline __device__ uint64_t get() const { return desc; } + + private: + // the descriptor, each of 64 bit + uint64_t desc; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// for a +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +class Gmma_descriptor_a { + public: + // The type of the Single Descriptor + using Single_desc = Single_descriptor; + + // Transpose Mode + static constexpr Gmma_descriptor_transpose TRANS_MODE = Gmma_trans; + + // The number of descriptors per 64xNblockxKblock. + static constexpr Gmma_descriptor_size GMMA_DESC_SIZE_PER_GROUP = Gmma_vector_size; + + // Currently the number of descriptors per 64xNblockxKblock is always One + // Historically we have supported more descriptors. But that has proven to + // be less performant as it consumes too many uniform registers. + // During the process of refactoring we have decided to only support allocating + // one desc per 64xNblockxKblock. If needed in the future, we can support + // more desc. + static_assert(Gmma_vector_size == Gmma_descriptor_size::ONE, + "Currently, only Mblock/64 desc is allocated per kgroup\n"); + + // Interleaved Mode is currently not supported. + // static_assert to avoid accidentally instantiate it. + static_assert(Gmma_mode != Gmma_descriptor_mode::SWIZZLE_NONE, + "Currently, SWIZZLE_NONE mode is not implemented. \n"); + + // byte per leading dim (row if TN, column is NT) must be 128 + enum { BYTES_PER_LEADING_DIM = 128 }; + + // bytes per element + enum { BYTES_PER_ELEMENT = BITS_PER_ELEMENT / 8 }; + + // the number of descriptors per kblock is related to GMMA shape and kblock size + enum { + NUM_DESCRIPTORS = (Gmma_vector_size == Gmma_descriptor_size::ALL) ? 
Cta_tile::K / GMMA_K : 1 + }; + + // the number of descriptors per 128 byte in k dimension (leading dim) + // NUM_DESCRIPTORS_PER_128B_IN_K is really only needed if leading dim is K + enum { + NUM_DESCRIPTORS_PER_128B_IN_K = (Gmma_mode == Gmma_descriptor_mode::SWIZZLE_128B && + Gmma_trans == Gmma_descriptor_transpose::NOTRANS) + ? BYTES_PER_LEADING_DIM / ((GMMA_K * BITS_PER_ELEMENT) / 8) + : NUM_DESCRIPTORS + }; + + static constexpr uint32_t BYTES_PER_GMMA_K = GMMA_K * BITS_PER_ELEMENT / 8; // 32B + + // the distance between neighboring descriptors + static constexpr uint32_t BYTES_PER_DESC = + Gmma_vector_size == Gmma_descriptor_size::ALL ? 0 + : Gmma_trans == Gmma_descriptor_transpose::TRANS + ? Gmma_mode == Gmma_descriptor_mode::SWIZZLE_128B ? GMMA_K * BYTES_PER_LEADING_DIM + : Gmma_mode == Gmma_descriptor_mode::SWIZZLE_64B ? (GMMA_K / 2) * BYTES_PER_LEADING_DIM + : Gmma_mode == Gmma_descriptor_mode::SWIZZLE_32B ? (GMMA_K / 4) * BYTES_PER_LEADING_DIM + : 0 + : Gmma_trans == Gmma_descriptor_transpose::NOTRANS + ? Gmma_mode == Gmma_descriptor_mode::SWIZZLE_128B || + Gmma_mode == Gmma_descriptor_mode::SWIZZLE_64B + ? BYTES_PER_GMMA_K // 32B + : Gmma_mode == Gmma_descriptor_mode::SWIZZLE_32B ? Cta_tile::M * BYTES_PER_GMMA_K + : 0 + : 0; + + // the distance between neighboring desc without 4LSB + static constexpr uint32_t BYTES_PER_DESC_NO_4LSB = BYTES_PER_DESC >> 4; + + // the distance to travel back from the last desc to the first desc within a group + enum { BYTES_DESC_INC_BOUNDARY_NO_4LSB = BYTES_PER_DESC_NO_4LSB * (Cta_tile::K / GMMA_K - 1) }; + + // set GMMA descriptor mode bits. + static constexpr uint64_t DESCRIPTOR_MODE_IN_BIT_LOCATION = + (static_cast(Gmma_mode) & ((1u << GMMA_DESCRIPTOR_MODE_BITS) - 1)) + << GMMA_DESCRIPTOR_MODE_SHIFT; + + // stride byte offset, bit 32-45, 4LSB not included + // each row is always of 128 byte. 8 rows always. + // divide by 16 since the 4 LSB is not included + static constexpr uint64_t STRIDE_BYTE_OFFSET = + BYTES_PER_LEADING_DIM * + ((Gmma_mode == Gmma_descriptor_mode::SWIZZLE_128B) ? 8 + : Gmma_mode == Gmma_descriptor_mode::SWIZZLE_64B ? 4 + : 2) / + 16; + // shift 32 bit + static constexpr uint64_t STRIDE_BYTE_OFFSET_IN_BIT_LOCATION = STRIDE_BYTE_OFFSET << 32; + + // leading byte offset, bit 16-29, 4LSB not included + // each row is still 128 byte. + // divide by 16 since the 4 LSB is not included + // for A matrix of TN, and the way we reshape the matrix, LEADING_BYTE_OFFSET is never non-zero + // in the future with different GMMA shape, this might be needed + static constexpr bool LEADING_BYTE_OFFSET_NEEDED = false; + + // the leading byte offset if needed 4LSB not included + static constexpr uint64_t LEADING_BYTE_OFFSET = + Gmma_mode == Gmma_descriptor_mode::SWIZZLE_32B + ? BYTES_PER_LEADING_DIM / 16 + : BYTES_PER_LEADING_DIM * + ((Gmma_trans == Gmma_descriptor_transpose::TRANS) ? Cta_tile::K : Cta_tile::M) / 16; + // shift 16 bit + static constexpr uint64_t LEADING_BYTE_OFFSET_IN_BIT_LOCATION = + LEADING_BYTE_OFFSET_NEEDED ? 
LEADING_BYTE_OFFSET << 16 : 0; + + // ctor + inline __device__ Gmma_descriptor_a() { +#pragma unroll + for (int desc_idx = 0; desc_idx < NUM_DESCRIPTORS; ++desc_idx) { + desc[desc_idx] = 0; + } + +// set bit 62-63 to 1 for SWIZZLE_128B format +// set bit 62-63 to 2 for SWIZZLE_64B format +#pragma unroll + for (int desc_idx = 0; desc_idx < NUM_DESCRIPTORS; ++desc_idx) { + desc[desc_idx] |= DESCRIPTOR_MODE_IN_BIT_LOCATION; + } + +// stride byte offset, bit 32-45, 4LSB not included +#pragma unroll + for (int desc_idx = 0; desc_idx < NUM_DESCRIPTORS; ++desc_idx) { + desc[desc_idx] |= STRIDE_BYTE_OFFSET_IN_BIT_LOCATION; + } + + // leading byte offset, bit 16-29, 4LSB not included + if (LEADING_BYTE_OFFSET_NEEDED) { +#pragma unroll + for (int desc_idx = 0; desc_idx < NUM_DESCRIPTORS; ++desc_idx) { + desc[desc_idx] |= LEADING_BYTE_OFFSET_IN_BIT_LOCATION; + } + } + } + + // update the descriptor based on smem address. Should be called once from prologue. + inline __device__ void set_smem_pointer(uint32_t smem_nvvm_pointer) { + // uint32_t smem_nvvm_pointer = get_smem_pointer(smem); + uint64_t smem_address_bit = static_cast(smem_nvvm_pointer); + + // set base offset, bit 49-61 + uint64_t offset = (smem_address_bit / BYTES_PER_LEADING_DIM) % + ((Gmma_mode == Gmma_descriptor_mode::SWIZZLE_128B) ? 8 + : Gmma_mode == Gmma_descriptor_mode::SWIZZLE_64B ? 4 + : 2); + uint64_t offset_in_bit_location = offset << 49; +#pragma unroll + for (int desc_idx = 0; desc_idx < NUM_DESCRIPTORS; ++desc_idx) { + desc[desc_idx] |= offset_in_bit_location; + } + +// start_address, bit 0-13, 4LSB not included (so grab bit 4-17) +// the only bits that is different for each desc of the same obj +#pragma unroll + for (int desc_idx = 0; desc_idx < NUM_DESCRIPTORS; ++desc_idx) { + // for fp16, desc_idx_in_128B should range from 0 to 3 + int desc_idx_in_128B = desc_idx % NUM_DESCRIPTORS_PER_128B_IN_K; + int desc_idx_over_128B = desc_idx / NUM_DESCRIPTORS_PER_128B_IN_K; + + uint64_t smem_address_bit_in_bit_location = + (smem_address_bit + ((GMMA_K * BITS_PER_ELEMENT) / 8) * desc_idx_in_128B + + Cta_tile::M * BYTES_PER_LEADING_DIM * desc_idx_over_128B) + << 46; + + smem_address_bit_in_bit_location = smem_address_bit_in_bit_location >> 50; + desc[desc_idx] |= smem_address_bit_in_bit_location; + } + } + + // get a single desc from the desc group. + inline __device__ uint64_t get_descriptor(int desc_idx) const { + // printf("desc[0] = 0x%lx\n", desc[0]); + return desc[(Gmma_vector_size == Gmma_descriptor_size::ALL) ? desc_idx : 0]; + } + + // get the max descriptor for desc[0] + inline __device__ uint64_t get_max_descriptor_0() const { return max_desc_0; } + + // set a single desc from the desc group. + inline __device__ void set_descriptor(int desc_idx, uint64_t single_desc) { + desc[(Gmma_vector_size == Gmma_descriptor_size::ALL) ? desc_idx : 0] = single_desc; + } + + // set the max descriptor for desc[0]. Should be called once from prologue. + // Should be called with set_smem_pointer() + // This value is needed to "loop back" to the first LDGSTS buffer when appropriate. + inline __device__ void set_max_descriptor_0(int mem_offset_no_4LSB) { + max_desc_0 = desc[0] + mem_offset_no_4LSB; + } + + // for desc group where all desc all allocated, + // increment_single_descriptor() will do nothing. + inline __device__ void increment_single_descriptor(bool last_of_kblock) { + // update smem start address, which is in lower 32bits. 
+ int2& tmp = reinterpret_cast(desc[0]); + if (last_of_kblock == true) { + tmp.x -= BYTES_DESC_INC_BOUNDARY_NO_4LSB; + } else { + tmp.x += BYTES_PER_DESC_NO_4LSB; + } + } + + template + inline __device__ void increment_single_descriptor() { + int2& tmp = reinterpret_cast(desc[0]); + tmp.x += (BYTE_OFFSET >> 4); + } + + private: + // the descriptors, each of 64 bit + uint64_t desc[NUM_DESCRIPTORS]; + // the max desc for desc_idx = 0 + uint64_t max_desc_0; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// for b +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +class Gmma_descriptor_b { + public: + // The type of the Single Descriptor + using Single_desc = Single_descriptor; + + // Transpose mode. + static constexpr Gmma_descriptor_transpose TRANS_MODE = Gmma_trans; + + // The number of descriptors per 64xNblockxKblock. + static constexpr Gmma_descriptor_size GMMA_DESC_SIZE_PER_GROUP = Gmma_vector_size; + + // Currently the number of descriptors per 64xNblockxKblock is always One + // Historically we have supported more descriptors. But that has proven to + // be less performant as it consumes too many uniform registers. + // During the process of refactoring we have decided to only support allocating + // one desc per 64xNblockxKblock. If needed in the future, we can support + // more desc. + static_assert(Gmma_vector_size == Gmma_descriptor_size::ONE, + "Currently, only Mblock/64 desc is allocated per kgroup\n"); + + // Interleaved Mode is currently not supported. + // static_assert to avoid accidentally instantiate it. + static_assert(Gmma_mode != Gmma_descriptor_mode::SWIZZLE_NONE, + "Currently, SWIZZLE_NONE mode is not implemented. \n"); + + // byte per leading dim (column if TN, row if NT), must be 128 + enum { BYTES_PER_LEADING_DIM = 128 }; + + // bytes per element + enum { BYTES_PER_ELEMENT = BITS_PER_ELEMENT / 8 }; + + // the number of descriptors per kblock is related to GMMA shape and kblock size + enum { + NUM_DESCRIPTORS = (Gmma_vector_size == Gmma_descriptor_size::ALL) ? Cta_tile::K / GMMA_K : 1 + }; + + // the number of descriptors per 128 byte in k dimension (leading dim) + // NUM_DESCRIPTORS_PER_128B_IN_K is really only needed if leading dim is K + enum { + NUM_DESCRIPTORS_PER_128B_IN_K = (Gmma_mode == Gmma_descriptor_mode::SWIZZLE_128B && + Gmma_trans == Gmma_descriptor_transpose::NOTRANS) + ? BYTES_PER_LEADING_DIM / ((GMMA_K * BITS_PER_ELEMENT) / 8) + : NUM_DESCRIPTORS + }; + + static constexpr uint32_t BYTES_PER_GMMA_K = GMMA_K * BITS_PER_ELEMENT / 8; // 32B + + // the distance between neighboring descriptors + static constexpr uint32_t BYTES_PER_DESC = + Gmma_vector_size == Gmma_descriptor_size::ALL ? 0 + : Gmma_trans == Gmma_descriptor_transpose::TRANS + ? Gmma_mode == Gmma_descriptor_mode::SWIZZLE_128B ? GMMA_K * BYTES_PER_LEADING_DIM + : Gmma_mode == Gmma_descriptor_mode::SWIZZLE_64B ? (GMMA_K / 2) * BYTES_PER_LEADING_DIM + : Gmma_mode == Gmma_descriptor_mode::SWIZZLE_32B ? (GMMA_K / 4) * BYTES_PER_LEADING_DIM + : 0 + : Gmma_trans == Gmma_descriptor_transpose::NOTRANS + ? Gmma_mode == Gmma_descriptor_mode::SWIZZLE_128B || + Gmma_mode == Gmma_descriptor_mode::SWIZZLE_64B + ? BYTES_PER_GMMA_K // 32B + : Gmma_mode == Gmma_descriptor_mode::SWIZZLE_32B ? 
GMMA_N * BYTES_PER_GMMA_K + : 0 + : 0; + + // the distance between neighboring desc without 4LSB + static constexpr uint32_t BYTES_PER_DESC_NO_4LSB = BYTES_PER_DESC >> 4; + + // the distance to travel back from the last desc to the first desc within a group + enum { BYTES_DESC_INC_BOUNDARY_NO_4LSB = BYTES_PER_DESC_NO_4LSB * (Cta_tile::K / GMMA_K - 1) }; + + // Byte count on tile-K dimension + enum { + RESET_SMEM = ((Gmma_trans == Gmma_descriptor_transpose::NOTRANS) && + (((Cta_tile::K * BITS_PER_ELEMENT) / (8 * BYTES_PER_LEADING_DIM)) > 1)) + ? true + : false + }; + + // Reset bytes per BYTES_PER_LEADING_DIM (128) x tile-N + enum { RESET_BYTES_NO_4LSB = (BYTES_PER_LEADING_DIM * Cta_tile::N) / 16 }; + + // set GMMA descriptor mode bits. + static constexpr uint64_t DESCRIPTOR_MODE_IN_BIT_LOCATION = + (static_cast(Gmma_mode) & ((1u << GMMA_DESCRIPTOR_MODE_BITS) - 1)) + << GMMA_DESCRIPTOR_MODE_SHIFT; + + // stride byte offset, bit 32-45, 4LSB not included + // each column is always of 128 byte. 8 columns always. + // divide by 16 since the 4 LSB is not included + static constexpr uint64_t STRIDE_BYTE_OFFSET = + BYTES_PER_LEADING_DIM * + ((Gmma_mode == Gmma_descriptor_mode::SWIZZLE_128B) ? 8 + : Gmma_mode == Gmma_descriptor_mode::SWIZZLE_64B ? 4 + : 2) / + 16; + // shift 32 bit + static constexpr uint64_t STRIDE_BYTE_OFFSET_IN_BIT_LOCATION = STRIDE_BYTE_OFFSET << 32; + + // leading byte offset, bit 16-29, 4LSB not included + // each column is still 128 byte. + // divide by 16 since the 4 LSB is not included + // for B matrix of TN, and the way we reshape the matrix, LEADING_BYTE_OFFSET is never non-zero + // in the future with different GMMA shape, this might be needed + static constexpr bool LEADING_BYTE_OFFSET_NEEDED = + (((GMMA_N * BITS_PER_ELEMENT) / 8 > BYTES_PER_LEADING_DIM && + Gmma_trans == Gmma_descriptor_transpose::TRANS) || + GMMA_K == 64) + ? true + : false; + + // the leading byte offset if needed 4LSB not included + static constexpr uint64_t LEADING_BYTE_OFFSET = + GMMA_K == 64 + ? Cta_tile::N * 32 / 16 + : (BYTES_PER_LEADING_DIM * + ((Gmma_trans == Gmma_descriptor_transpose::TRANS) ? Cta_tile::K : Cta_tile::N) / 16); + // shift 16 bit + static constexpr uint64_t LEADING_BYTE_OFFSET_IN_BIT_LOCATION = + LEADING_BYTE_OFFSET_NEEDED ? LEADING_BYTE_OFFSET << 16 : 0; + + // ctor + inline __device__ Gmma_descriptor_b() { +#pragma unroll + for (int desc_idx = 0; desc_idx < NUM_DESCRIPTORS; ++desc_idx) { + desc[desc_idx] = 0; + } + +// set bit 62-63 to 1 for SWIZZLE_128B format +// set bit 62-63 to 2 for SWIZZLE_64B format +#pragma unroll + for (int desc_idx = 0; desc_idx < NUM_DESCRIPTORS; ++desc_idx) { + desc[desc_idx] |= DESCRIPTOR_MODE_IN_BIT_LOCATION; + } + +// stride byte offset, bit 32-45, 4LSB not included +#pragma unroll + for (int desc_idx = 0; desc_idx < NUM_DESCRIPTORS; ++desc_idx) { + desc[desc_idx] |= STRIDE_BYTE_OFFSET_IN_BIT_LOCATION; + } + + // leading byte offset, bit 16-29, 4LSB not included + if (LEADING_BYTE_OFFSET_NEEDED) { +#pragma unroll + for (int desc_idx = 0; desc_idx < NUM_DESCRIPTORS; ++desc_idx) { + desc[desc_idx] |= LEADING_BYTE_OFFSET_IN_BIT_LOCATION; + } + } + } + + // update the descriptor based on smem address. Should be called once from prologue. 
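+ // Summary of the 64-bit descriptor layout assembled by the ctor above and by
+ // set_smem_pointer() below (offsets drop their 4 LSB, i.e. are in 16-byte units):
+ //   bits  0-13 : start address (taken from bits 4-17 of the smem pointer)
+ //   bits 16-29 : leading byte offset (only when LEADING_BYTE_OFFSET_NEEDED)
+ //   bits 32-45 : stride byte offset
+ //   bits 49-61 : base offset
+ //   bits 62-63 : descriptor/swizzle mode (SWIZZLE_128B / 64B / 32B)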
+ inline __device__ void set_smem_pointer(uint32_t smem_nvvm_pointer) { + // uint64_t smem_address_bit = reinterpret_cast(smem); + // uint32_t smem_nvvm_pointer = get_smem_pointer(smem); + uint64_t smem_address_bit = static_cast(smem_nvvm_pointer); + + // set base offset, bit 49-61 + uint64_t offset = (smem_address_bit / BYTES_PER_LEADING_DIM) % + ((Gmma_mode == Gmma_descriptor_mode::SWIZZLE_128B) ? 8 + : Gmma_mode == Gmma_descriptor_mode::SWIZZLE_64B ? 4 + : 2); + uint64_t offset_in_bit_location = offset << 49; +#pragma unroll + for (int desc_idx = 0; desc_idx < NUM_DESCRIPTORS; ++desc_idx) { + desc[desc_idx] |= offset_in_bit_location; + } + +// start_address, bit 0-13, 4LSB not included(so grab bit 4-17) +// the only bits that is different for each desc of the same obj +#pragma unroll + for (int desc_idx = 0; desc_idx < NUM_DESCRIPTORS; ++desc_idx) { + // for fp16, desc_idx_in_128B should range from 0 to 3 + int desc_idx_in_128B = desc_idx % NUM_DESCRIPTORS_PER_128B_IN_K; + int desc_idx_over_128B = desc_idx / NUM_DESCRIPTORS_PER_128B_IN_K; + + uint64_t smem_address_bit_in_bit_location = + (smem_address_bit + ((GMMA_K * BITS_PER_ELEMENT) / 8) * desc_idx_in_128B + + Cta_tile::N * BYTES_PER_LEADING_DIM * desc_idx_over_128B) + << 46; + smem_address_bit_in_bit_location = smem_address_bit_in_bit_location >> 50; + desc[desc_idx] |= smem_address_bit_in_bit_location; + } + } + + // get a single desc from the desc group. + inline __device__ uint64_t get_descriptor(int desc_idx) const { + // if(threadIdx.x == 128) + // printf("desc[0] = 0x%lx\n", desc[0]); + //__syncwarp(); + return desc[(Gmma_vector_size == Gmma_descriptor_size::ALL) ? desc_idx : 0]; + } + + // get the max descriptor for desc[0] + inline __device__ uint64_t get_max_descriptor_0() const { return max_desc_0; } + + // set a single desc from the desc group. + inline __device__ void set_descriptor(int desc_idx, uint64_t single_desc) { + desc[(Gmma_vector_size == Gmma_descriptor_size::ALL) ? desc_idx : 0] = single_desc; + } + + // set the max descriptor for desc[0]. Should be called once from prologue. + // Should be called with set_smem_pointer() + // This value is needed to "loop back" to the first LDGSTS buffer when appropriate. + inline __device__ void set_max_descriptor_0(int mem_offset_no_4LSB) { + max_desc_0 = desc[0] + mem_offset_no_4LSB; + } + + // for desc group where all desc all allocated, + // increment_single_descriptor() will do nothing. + inline __device__ void increment_single_descriptor(bool last_of_kblock) { + // update smem start address, which is in lower 32bits. + int2& tmp = reinterpret_cast(desc[0]); + if (last_of_kblock == true) { + tmp.x -= BYTES_DESC_INC_BOUNDARY_NO_4LSB; + } else { + tmp.x += BYTES_PER_DESC_NO_4LSB; + } + } + + template + inline __device__ void increment_single_descriptor() { + int2& tmp = reinterpret_cast(desc[0]); + tmp.x += (BYTE_OFFSET >> 4); + } + + // for desc group where all desc all allocated, + // increment_single_descriptor() will do nothing. + inline __device__ void increment_single_descriptor(bool last_of_kblock, bool switch_kblock) { + // update smem start address, which is in lower 32bits. 
+ int2& tmp = reinterpret_cast(desc[0]); + if (RESET_SMEM) { + if (switch_kblock) { + tmp.x -= BYTES_PER_DESC_NO_4LSB; + tmp.x += RESET_BYTES_NO_4LSB; + } else { + if (last_of_kblock == true) { + tmp.x -= BYTES_PER_DESC_NO_4LSB; + tmp.x -= RESET_BYTES_NO_4LSB; + } else { + tmp.x += BYTES_PER_DESC_NO_4LSB; + } + } + } else { + if (last_of_kblock == true) { + tmp.x -= BYTES_DESC_INC_BOUNDARY_NO_4LSB; + } else { + tmp.x += BYTES_PER_DESC_NO_4LSB; + } + } + } + + private: + // the descriptors, each of 64 bit + uint64_t desc[NUM_DESCRIPTORS]; + // the max desc for desc_idx = 0 + uint64_t max_desc_0; +}; + +} // namespace fmha diff --git a/csrc/fmha_v2/fmha/hopper/kernel_traits.h b/csrc/fmha_v2/fmha/hopper/kernel_traits.h new file mode 100644 index 0000000000..edeff1e281 --- /dev/null +++ b/csrc/fmha_v2/fmha/hopper/kernel_traits.h @@ -0,0 +1,365 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2011-2024 NVIDIA CORPORATION & AFFILIATES. All rights + * reserved. SPDX-License-Identifier: NVIDIA TensorRT Source Code License Agreement + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. + */ + +#pragma once +#include +#include +#include +#include +#include +#include +#include + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + // Instruction traits. + typename Traits_p_, + // Instruction traits. + typename Traits_o_, + // The ldgsts global memory tile for Q, K and V. + template class Gmem_tile_qkv_, + // The tma global memory tile for Q, K and V. + template class Gmem_tile_tma_qkv_, + // The global memory tile for the output. + template class Gmem_tile_o_, + // Sequence length. + int S, + // The hidden dimension. + int D, + // The iteration step of the outer loop. + int STEP, + // The number of vertical warps. + int WARPS_M, + // The number of horizontal warps. + int WARPS_N, + // The version of the kernel. + int VERSION_, + // The mask version of the kernel, (2 denotes dense mask, 3 denotes causal mask) + int MASK_VERSION_ = 2, + // The flags to control the behaviour of LDGs. + uint32_t FLAGS = 0x8u> +struct FMHA_kernel_traits_hopper { + // The instruction traits for the Q*K product. + using Traits_p = Traits_p_; + + // is Q operand in RF for GMMA? + static constexpr bool GMMA_Q_RF = Traits_p::GMMA_A_RF; + + // is K operand in RF for GMMA? + static constexpr bool GMMA_K_RF = Traits_p::GMMA_B_RF; + + // The instruction traits for P*V product. + using Traits_o = Traits_o_; + + // is S operand in RF for GMMA? + static constexpr bool GMMA_S_RF = Traits_o::GMMA_A_RF; + + // is V operand in RF for GMMA? + static constexpr bool GMMA_V_RF = Traits_o::GMMA_B_RF; + + // The number of warpgroups along M dimension + enum { WARP_GROUP_M = WARPS_M / 4 }; + + // The number of warpgroups along N dimension + enum { WARP_GROUP_N = WARPS_N }; + + // The number of warpgroups along K dimension + enum { WARP_GROUP_K = 1 }; + + // The CTA description for the 1st GEMM. + using Cta_tile_p = + typename Traits_p::template Cta_tile; + // The CTA description for the 2nd GEMM. + using Cta_tile_o = + typename Traits_o::template Cta_tile; + + // The version. 
+ enum { VERSION = VERSION_ }; + + enum { MASK_VERSION = MASK_VERSION_ }; + + // Whether use causal mask or not. + enum { CAUSAL_MASK = MASK_VERSION_ >= 3 }; + + // Whether use the sliding window attention mask or not. + enum { SLIDING_WINDOW_ATTENTION = MASK_VERSION_ == 4 }; + + // Do we use LDGSTS for Q, K or V. If not, TMA is used! + enum { USE_LDGSTS_Q = (FLAGS & 0x1u) != 0u }; + + enum { USE_LDGSTS_K = (FLAGS & 0x2u) != 0u }; + + enum { USE_LDGSTS_V = (FLAGS & 0x4u) != 0u }; + + enum { USE_TMA_Q = !USE_LDGSTS_Q }; + + enum { USE_TMA_K = !USE_LDGSTS_K }; + + enum { USE_TMA_V = !USE_LDGSTS_V }; + + // Do we use one buffer for K and V. + enum { SHARE_SMEM_FOR_K_AND_V = 0 }; + + // Do we use the scale max trick. + enum { USE_SCALE_MAX = 0 }; + + // Are heads in QKV interleaved, i.e. total x h x 3 x d or total x 3 x h x d. + enum { HEADS_INTERLEAVED = (FLAGS & 0x20u) == 0u }; + + // Use BMM1 softcapping scale or not. + enum { ENABLE_BMM1_SOFTCAPPING_SCALE = (FLAGS & 0x800) != 0u }; + + // Number of matrix for gmem_tile_qkv + enum { NUM_QKV_MATS = 3 }; + + // The global memory tile to load Q. + // Hopefully we don't need to specialize for Hopper. + using Gmem_tile_ldgsts_q = + Gmem_tile_qkv_; + + // The global memory tile to load Q with TMA. + using Gmem_tile_tma_q = Gmem_tile_tma_qkv_; + + // Do we use ldgsts gmem tile or tma gmem tile? + using Gmem_tile_q = + typename std::conditional_t; + + // 2 buffers for Q + enum { BUFFERS_PER_SMEM_TILE_Q = 2 }; + + // Q is row major + using Q_layout = fmha::Row; + + // We know Q is row-major. So we can also deduce the descriptor mode. + static constexpr fmha::Gmma_descriptor_mode GMMA_DESC_MODE_Q = + Cta_tile_p::K * sizeof(typename Traits_p::A_type) >= 128 + ? fmha::Gmma_descriptor_mode::SWIZZLE_128B + : fmha::Gmma_descriptor_mode::SWIZZLE_64B; + + // The shared memory tile to swizzle Q. + using Smem_tile_ldgsts_q = + fmha::Smem_tile_hopper_a; + + // The shared memory tile to swizzle Q. TODO: need to update to XMMA. + using Smem_tile_tma_q = + fmha::wip::Smem_tile_hopper_a; + + using Smem_tile_q = + typename std::conditional_t; + + // The global memory tile to load K. + // Hopefully we don't need to specialize for hopper. + using Gmem_tile_ldgsts_k = + Gmem_tile_qkv_; + + // The global memory tile to load K with TMA. + using Gmem_tile_tma_k = Gmem_tile_tma_qkv_; + + // Do we use ldgsts gmem tile or tma gmem tile? + using Gmem_tile_k = + typename std::conditional_t; + + // 1 buffers for K + enum { BUFFERS_PER_SMEM_TILE_K = 1 }; + + // K is column major + using K_layout = fmha::Col; + + // We know K is column-major. So we can also deduce the descriptor mode. + static constexpr fmha::Gmma_descriptor_mode GMMA_DESC_MODE_K = + Cta_tile_p::K * sizeof(typename Traits_p::B_type) >= 128 + ? fmha::Gmma_descriptor_mode::SWIZZLE_128B + : fmha::Gmma_descriptor_mode::SWIZZLE_64B; + + // The shared memory tile to swizzle K. + using Smem_tile_ldgsts_k = + fmha::Smem_tile_hopper_b; + + using Smem_tile_tma_k = + fmha::wip::Smem_tile_hopper_b; + + using Smem_tile_k = + typename std::conditional_t; + + // The global memory tile to load V. + using Gmem_tile_ldgsts_v = + Gmem_tile_qkv_; + + // The global memory tile to load V with TMA. + using Gmem_tile_tma_v = Gmem_tile_tma_qkv_; + + // Do we use ldgsts gmem tile or tma gmem tile? + using Gmem_tile_v = + typename std::conditional_t; + + // 1 buffers for V + enum { BUFFERS_PER_SMEM_TILE_V = 1 }; + + // V is row major + using V_layout = fmha::Row; + + // We know V is row marjor. So we can also deduce the descriptor mode. 
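+ // For example, with an fp16/bf16 V tile and Cta_tile_o::N == 64 the contiguous extent is
+ // 64 * 2 = 128 bytes, so SWIZZLE_128B is picked; a 32-wide tile would select SWIZZLE_64B.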
+ static constexpr fmha::Gmma_descriptor_mode GMMA_DESC_MODE_V = + Cta_tile_o::N * sizeof(typename Traits_o::B_type) >= 128 + ? fmha::Gmma_descriptor_mode::SWIZZLE_128B + : fmha::Gmma_descriptor_mode::SWIZZLE_64B; + + // The shared memory tile to swizzle V. + using Smem_tile_ldgsts_v = fmha::Smem_tile_v; + + using Smem_tile_tma_v = + fmha::wip::Smem_tile_hopper_b; + + using Smem_tile_v = + typename std::conditional_t; + + // The global memory tile to store O. + // using Gmem_tile_o = fmha::Gmem_tile_o_hopper; + using Gmem_tile_o = fmha::v2::Gmem_tile_o; + + using Smem_tile_o_ = fmha::Smem_tile_o; + static constexpr bool NEEDS_SPLIT_K = WARPS_N > 1; + using Smem_tile_o = + typename std::conditional_t; + + // The amount of shared memory needed to load Q and K. + enum { BYTES_PER_SMEM_QK = Smem_tile_q::BYTES_PER_TILE + Smem_tile_k::BYTES_PER_TILE }; + + // The extra amount of shared memory needed to load V. + enum { BYTES_PER_SMEM_V = SHARE_SMEM_FOR_K_AND_V ? 0u : Smem_tile_v::BYTES_PER_TILE }; + + // The amount of shared memory needed for Q, K and V.. + enum { BYTES_PER_SMEM_QKV = BYTES_PER_SMEM_QK + BYTES_PER_SMEM_V }; + + // The amount of shared memory needed to load Q and store O. + // enum { BYTES_PER_SMEM_QO = Smem_tile_q::BYTES_PER_TILE + Smem_tile_o::BYTES_PER_TILE }; + // For now let's pretend no smem for O matrix. [Timmy] + enum { BYTES_PER_SMEM_QO = Smem_tile_q::BYTES_PER_TILE }; + + // The amount of over allocated smem to guarantee 1024B alignment. + enum { BYTES_FOR_ALIGNMENT = 1024 }; + + // The size in bytes for each SMEM barrier + enum { BYTES_PER_SMEM_BARRIER = 8 }; + + // The amount of smem used by smem barrier. Only needed if TMA is used. + enum { + BYTES_FOR_SMEM_BARRIER_Q = + USE_LDGSTS_Q == 1 ? 0 : BUFFERS_PER_SMEM_TILE_Q * BYTES_PER_SMEM_BARRIER + }; + + // The amount of smem used by smem barrier. Only needed if TMA is used. + // each smem barrier is 8 bytes, each buffer has 2 barriers + enum { + BYTES_FOR_SMEM_BARRIER_K = + USE_LDGSTS_K == 1 ? 0 : BUFFERS_PER_SMEM_TILE_K * BYTES_PER_SMEM_BARRIER + }; + + // The amount of smem used by smem barrier. Only needed if TMA is used. + // Currently, K and V can share the same barrier. + enum { BYTES_FOR_SMEM_BARRIER_V = 0 }; + + // The amount of smem used by smem barrier. Only needed if TMA is used. + enum { + BYTES_FOR_SMEM_BARRIER = + BYTES_FOR_SMEM_BARRIER_Q + BYTES_FOR_SMEM_BARRIER_K + BYTES_FOR_SMEM_BARRIER_V + }; + + // TODO move those + enum { BYTES_FOR_SOFTMAX = WARPS_N == 1 ? 0 : sizeof(float) * WARPS_N * 64 }; + + enum { + BYTES_PER_SMEM_O = + WARPS_N == 1 ? 0 : WARPS_N * 64 * D * sizeof(typename Traits_o::Epilogue_type) + }; + + static_assert(Smem_tile_o::BYTES_PER_TILE == (int)BYTES_PER_SMEM_O); + + // The amount of shared memory needed for Q, K, V and O. + // TODO double check. + // - For GMMA QKV are always stored in SMEM. + // - Cannot share SMEM K/V + // - O needs to be separate + // enum { BYTES_PER_SMEM = fmha::Max::VALUE + enum { + BYTES_PER_SMEM = BYTES_PER_SMEM_QKV + BYTES_PER_SMEM_O + BYTES_FOR_SOFTMAX + + BYTES_FOR_SMEM_BARRIER + BYTES_FOR_ALIGNMENT + }; + + // The number of threads. + enum { THREADS = Cta_tile_p::THREADS_PER_CTA }; + + // Make sure the number of threads matches both CTAs. + static_assert((int)THREADS == (int)Cta_tile_o::THREADS_PER_CTA, ""); + + // The compute tile for P = Q*K. + using Compute_tile_p = + fmha::Compute_tile_with_gmma; + // The compute tile for O = S*V. 
+ using Compute_tile_o = + fmha::Compute_tile_with_gmma; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + // The BMM1 instruction traits. + typename Traits_p, + // The BMM2 instruction traits. + typename Traits_o, + // The sequence length. + int S, + // The hidden size per head. + int D, + // The number of timesteps per iteration of the main loop. + int STEP, + // The number of vertical warps. + int WARPS_M, + // The number of horizontal warps. + int WARPS_N, + // The attention mask type (2 denotes dense mask, 3 denotes causal mask). + int MASK_VERSION, + // The flags. + uint32_t FLAGS = 0x8> +using FMHA_kernel_traits_hopper_v2 = + FMHA_kernel_traits_hopper; + +//////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/csrc/fmha_v2/fmha/hopper/smem_tile.h b/csrc/fmha_v2/fmha/hopper/smem_tile.h new file mode 100644 index 0000000000..b921b48db2 --- /dev/null +++ b/csrc/fmha_v2/fmha/hopper/smem_tile.h @@ -0,0 +1,2423 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2011-2024 NVIDIA CORPORATION & AFFILIATES. All rights + * reserved. SPDX-License-Identifier: NVIDIA TensorRT Source Code License Agreement + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace fmha { + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// +/// @brief Interface to Smem tiles for a operator +// HGMMA +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +enum class Gmma_fusion_mode { NO_FUSION, BN_APPLY }; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace wip { + +template +struct Smem_tile_hopper_a {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Smem_tile_hopper_b {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// A Col Major. For GMMA, A is from SMEM directly. +// Not implemented, since it is not really needed at the moment. +template +struct Smem_tile_hopper_gmma_col_a {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// A Row Major. For GMMA, A is from SMEM directly. +template +struct Smem_tile_hopper_gmma_row_a { + // Currently Interleaved Mode is not implemented. + static_assert(desc_mode != fmha::Gmma_descriptor_mode::SWIZZLE_NONE, + "Currently, SWIZZLE_NONE Mode is not implemented.\n"); + + // The MMA tile. + using Mma_tile = typename Traits::template Mma_tile; + + // The number of desc within a gmma group (kblock limited). + static constexpr fmha::Gmma_descriptor_size GMMA_DESC_SIZE_PER_GROUP = + fmha::Gmma_descriptor_size::ONE; + + // The SWIZZLE_128B descriptor. + using Gmma_descriptor = + fmha::Gmma_descriptor_a; + + using Cta_tile_gmma = Cta_tile; + + // the size in bits of each element. 
+ enum { BITS_PER_ELEMENT = Traits::BITS_PER_ELEMENT_A }; + + // the size of bytes of each element. + enum { BYTES_PER_ELEMENT = BITS_PER_ELEMENT / 8 }; + + // The size in bytes of a single LDGSTS/STS. + enum { BYTES_PER_STS = 16 }; + + // The number of elements per LDGSTS/STS. + enum { ELEMENTS_PER_STS = BYTES_PER_STS * 8 / BITS_PER_ELEMENT }; + + // SMEM layout for GMMA has a leading dim of exact 128 Byte, at least for SWIZZLE_128B + // and SWIZZLE_64B format. + enum { BYTES_PER_ROW = 128 }; + + // the number of rows per one row of K due the the limitation of leading dim size. + enum { NUM_ROWS_PER_K = (Cta_tile::K * BYTES_PER_ELEMENT + BYTES_PER_ROW - 1) / BYTES_PER_ROW }; + + static_assert(desc_mode != fmha::Gmma_descriptor_mode::SWIZZLE_64B || + (Cta_tile::K * BYTES_PER_ELEMENT) == 64, + "swizzle_64B row_a is valid if kblock=32\n"); + + // Number of SMEM rows. + enum { + NUM_ROWS = (desc_mode == fmha::Gmma_descriptor_mode::SWIZZLE_128B) + ? (Cta_tile::M * NUM_ROWS_PER_K) + : (Cta_tile::M / 2) + }; + + // The size of one buffer in bytes in shared memory. + enum { BYTES_PER_BUFFER = NUM_ROWS * BYTES_PER_ROW }; + + // the size of one buffer in bytes in shared memory, without the 4 LSB. + // this is needed to increment the GMMA desc to the next buffer. + enum { BYTES_PER_BUFFER_NO_4LSB = BYTES_PER_BUFFER / 16 }; + + // this is needed to decrement GMMA desc. + enum { + BYTES_PER_BUFFER_INC_BOUNDARY_NO_4LSB = + BYTES_PER_BUFFER_NO_4LSB * BUFFERS_PER_TILE_ - BYTES_PER_BUFFER_NO_4LSB + }; + + // The number of buffers. + enum { BUFFERS_PER_TILE = BUFFERS_PER_TILE_ }; + + // The size in bytes of total buffers. + enum { BYTES_PER_TILE = BYTES_PER_BUFFER * BUFFERS_PER_TILE }; + + // The boundary for smem_read_offset and smem_write_offset increment. + enum { BYTES_PER_TILE_INC_BOUNDARY = BYTES_PER_TILE - BYTES_PER_BUFFER }; + + // The number of threads needed to store a row + enum { THREADS_PER_ROW = BYTES_PER_ROW / BYTES_PER_STS }; + + // The number of rows written with a single STS. + enum { ROWS_PER_STS = Cta_tile::THREADS_PER_CTA / THREADS_PER_ROW }; + + // for swizzle_128B the xor factor is 8 + enum { ROWS_PER_XOR_PATTERN = (desc_mode == fmha::Gmma_descriptor_mode::SWIZZLE_128B) ? 8 : 4 }; + + // The distance in byte between different GMMA groups (might need multiple due to cta tile size) + // each GMMA group is of size GMMA_M x GMMA_N x Kblock + enum { + GMMA_GROUP_SMEM_DISTANCE = Mma_tile::M_PER_GMMA_GROUP / + (desc_mode == fmha::Gmma_descriptor_mode::SWIZZLE_128B ? 1 : 2) * + BYTES_PER_ROW + }; + + // The number of STS per row. + enum { STS_PER_ROW = BYTES_PER_ROW / THREADS_PER_ROW / BYTES_PER_STS }; + + // For Hopper, STS_PER_ROW should be 1 (at least for now.) + static_assert(STS_PER_ROW == 1, ""); + + // Ctor. + inline __device__ Smem_tile_hopper_gmma_row_a(char* smem, int tidx) + : smem_(__nvvm_get_smem_pointer(smem)) { + int smem_write_row = tidx / THREADS_PER_ROW; + int smem_write_xor = smem_write_row % ROWS_PER_XOR_PATTERN; + int smem_write_col = 0; + + if (desc_mode == fmha::Gmma_descriptor_mode::SWIZZLE_128B) { + smem_write_col = (tidx % THREADS_PER_ROW) ^ smem_write_xor; + } else if (desc_mode == fmha::Gmma_descriptor_mode::SWIZZLE_64B) { + smem_write_col = (tidx % (THREADS_PER_ROW / 2)) ^ + smem_write_xor + ((tidx % THREADS_PER_ROW) / (THREADS_PER_ROW / 2)) * 4; + } + + this->smem_write_offset_ = smem_write_row * BYTES_PER_ROW + smem_write_col * BYTES_PER_STS; + + // That code is expected to trigger the utilization of the URF by the compiler. 
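+ // (A __shfl_sync broadcast of a constant is warp-uniform by construction, which is
+ //  presumably what lets the compiler keep these buffer offsets in uniform registers.)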
+ this->smem_read_buffer_ = __shfl_sync(0xffffffff, 0, 0); + this->smem_write_buffer_ = __shfl_sync(0xffffffff, 0, 0); + } + + // Compute the store pointers. + template + inline __device__ void compute_store_pointers(uint32_t (&ptrs)[N]) { +#pragma unroll + for (int ii = 0; ii < N; ++ii) { + // Decompose the STS into row/col. + int row = ii / STS_PER_ROW; + // Assemble the offset. + int offset = smem_write_offset_ + row * ROWS_PER_STS * BYTES_PER_ROW; + // Assemble the final pointer :) + ptrs[ii] = smem_ + offset + smem_write_buffer_; + } + } + + // Store the tile in the shared memory. + template + inline __device__ void store(void const* (&gmem_ptrs)[N], uint32_t (&preds)[M], uint64_t = 0) { + uint32_t smem_ptrs[N]; + this->compute_store_pointers(smem_ptrs); + ldgsts(smem_ptrs, gmem_ptrs, preds); + } + + // Move the write offset to next buffer. + inline __device__ void move_next_write_buffer() { + if (BUFFERS_PER_TILE > 1) { + this->smem_write_offset_ += (smem_write_offset_ >= BYTES_PER_TILE_INC_BOUNDARY) + ? -BYTES_PER_TILE_INC_BOUNDARY + : BYTES_PER_BUFFER; + } + } + + inline __device__ void move_next_write_buffer(int) {} + + // Move the read offset to next buffer. + // do nothing, as it is controlled by gmma desc + inline __device__ void move_next_read_buffer() {} + + // The shared memory pointer. + uint32_t smem_; + // The read offset. Reserve 4 offsets if needed. + int smem_read_offset_; + // The write offset. + int smem_write_offset_; + // The buffer base offset for read. + int smem_read_buffer_; + // The buffer base offset for write. + int smem_write_buffer_; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// B Col Major. For GMMA, B is from SMEM directly. +template +struct Smem_tile_hopper_gmma_col_b { + // Currently Interleaved Mode is not implemented. + static_assert(desc_mode != fmha::Gmma_descriptor_mode::SWIZZLE_NONE, + "Currently, Interleaved Mode is not implemented.\n"); + + // The MMA tile. + using Mma_tile = typename Traits::template Mma_tile; + + // The number of desc within a gmma group (kblock limited) + static constexpr fmha::Gmma_descriptor_size GMMA_DESC_SIZE_PER_GROUP = + fmha::Gmma_descriptor_size::ONE; + + // The SWIZZLE_128B descriptor + using Gmma_descriptor = + fmha::Gmma_descriptor_b; + + using Cta_tile_gmma = Cta_tile; + + // the size in bits of each element. + enum { BITS_PER_ELEMENT = Traits::BITS_PER_ELEMENT_B }; + + // the size of bytes of each element. + enum { BYTES_PER_ELEMENT = BITS_PER_ELEMENT / 8 }; + + // The size in bytes of a single LDGSTS/STS. + enum { BYTES_PER_STS = 16 }; + + // The number of elements per LDGSTS/STS. + enum { ELEMENTS_PER_STS = BYTES_PER_STS * 8 / BITS_PER_ELEMENT }; + + // SMEM layout for GMMA has a leading dim of exact 128 Byte, at least for SWIZZLE_128B and + // SWIZZLE_64B format + enum { BYTES_PER_COLUMN = 128 }; + + static_assert(desc_mode != fmha::Gmma_descriptor_mode::SWIZZLE_64B || + (Cta_tile::K * BYTES_PER_ELEMENT) == 64, + "swizzle_64B col_b is valid if kblock=32\n"); + + // the number of columns per one column of K due the the limitation of leading dim size + enum { + NUM_COLS_PER_K = (Cta_tile::K * BYTES_PER_ELEMENT + BYTES_PER_COLUMN - 1) / BYTES_PER_COLUMN + }; + + // Number of SMEM columns. + enum { + NUM_COLUMNS = (desc_mode == fmha::Gmma_descriptor_mode::SWIZZLE_128B) + ? Cta_tile::N * NUM_COLS_PER_K + : Cta_tile::N / 2 + }; + + // The size of one buffer in bytes in shared memory. 
+ enum { BYTES_PER_BUFFER = NUM_COLUMNS * BYTES_PER_COLUMN }; + + // the size of one buffer in bytes in shared memory, without the 4 LSB. + // this is needed to increment the GMMA desc to the next buffer + enum { BYTES_PER_BUFFER_NO_4LSB = BYTES_PER_BUFFER / 16 }; + + // this is needed to decrement GMMA desc. + enum { + BYTES_PER_BUFFER_INC_BOUNDARY_NO_4LSB = + BYTES_PER_BUFFER_NO_4LSB * BUFFERS_PER_TILE_ - BYTES_PER_BUFFER_NO_4LSB + }; + + // The number of buffers. + enum { BUFFERS_PER_TILE = BUFFERS_PER_TILE_ }; + + // The size in bytes of total buffers. + enum { BYTES_PER_TILE = BYTES_PER_BUFFER * BUFFERS_PER_TILE }; + + // The boundary for smem_read_offset and smem_write_offset increment. + enum { BYTES_PER_TILE_INC_BOUNDARY = BYTES_PER_TILE - BYTES_PER_BUFFER }; + + // The number of threads needed to store a column. + enum { THREADS_PER_COLUMN = BYTES_PER_COLUMN / BYTES_PER_STS }; + + // The number of columns written with a single STS. + enum { COLUMNS_PER_STS = Cta_tile::THREADS_PER_CTA / THREADS_PER_COLUMN }; + + // for swizzle_128B the xor factor is 8. + enum { + COLUMNS_PER_XOR_PATTERN = (desc_mode == fmha::Gmma_descriptor_mode::SWIZZLE_128B) ? 8 : 4 + }; + + // The distance in byte between different GMMA groups (might need multiple due to cta tile size) + // each GMMA group is of size GMMA_M x GMMA_N x Kblock + enum { + GMMA_GROUP_SMEM_DISTANCE = Mma_tile::N_PER_GMMA_GROUP / + (desc_mode == fmha::Gmma_descriptor_mode::SWIZZLE_128B ? 1 : 2) * + BYTES_PER_COLUMN + }; + + // The number of STS per column. + enum { STS_PER_COLUMN = BYTES_PER_COLUMN / THREADS_PER_COLUMN / BYTES_PER_STS }; + + // For Hopper, STS_PER_COLUMN should be 1 (at least for now.) + static_assert(STS_PER_COLUMN == 1, ""); + + // Ctor. + inline __device__ Smem_tile_hopper_gmma_col_b(char* smem, int tidx) + : smem_(__nvvm_get_smem_pointer(smem)) { + int smem_write_col = tidx / THREADS_PER_COLUMN; + int smem_write_xor = smem_write_col % COLUMNS_PER_XOR_PATTERN; + int smem_write_row = 0; + + if (desc_mode == fmha::Gmma_descriptor_mode::SWIZZLE_128B) { + smem_write_row = (tidx % THREADS_PER_COLUMN) ^ smem_write_xor; + } else if (desc_mode == fmha::Gmma_descriptor_mode::SWIZZLE_64B) { + smem_write_row = + (tidx % (THREADS_PER_COLUMN / 2)) ^ + smem_write_xor + ((tidx % THREADS_PER_COLUMN) / (THREADS_PER_COLUMN / 2)) * 4; + } + + this->smem_write_offset_ = smem_write_col * BYTES_PER_COLUMN + smem_write_row * BYTES_PER_STS; + // That code is expected to trigger the utilization of the URF by the compiler. + this->smem_read_buffer_ = __shfl_sync(0xffffffff, 0, 0); + this->smem_write_buffer_ = __shfl_sync(0xffffffff, 0, 0); + } + + // Compute the store pointers. + template + inline __device__ void compute_store_pointers(uint32_t (&ptrs)[N]) { +#pragma unroll + for (int ii = 0; ii < N; ++ii) { + // Decompose the STS into row/col. + int col = ii / STS_PER_COLUMN; + // Assemble the offset. + int offset = smem_write_offset_ + col * COLUMNS_PER_STS * BYTES_PER_COLUMN; + // Assemble the final pointer :) + ptrs[ii] = smem_ + offset + smem_write_buffer_; + } + } + + // Store the tile in the shared memory. + template + inline __device__ void store(void const* (&gmem_ptrs)[N], uint32_t (&preds)[M], uint64_t = 0) { + uint32_t smem_ptrs[N]; + this->compute_store_pointers(smem_ptrs); + ldgsts(smem_ptrs, gmem_ptrs, preds); + } + + // Move the write offset to next buffer. 
+ inline __device__ void move_next_write_buffer() { + // if( BUFFERS_PER_TILE > 1 ) { + // this->smem_write_offset_ += ( smem_write_offset_ >= BYTES_PER_TILE_INC_BOUNDARY ) + // ? -BYTES_PER_TILE_INC_BOUNDARY + // : BYTES_PER_BUFFER; + // } + } + + inline __device__ void move_next_write_buffer(int) {} + + // Move the read offset to next buffer. + // do nothing, as it is controlled by gmma desc + inline __device__ void move_next_read_buffer() {} + + // The shared memory pointer. + uint32_t smem_; + // The read offset. Reserve 4 offsets if needed. + int smem_read_offset_; + // The write offset. + int smem_write_offset_; + // The buffer base offset for read. + int smem_read_buffer_; + // The buffer base offset for write. + int smem_write_buffer_; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// B Row Major. For GMMA, B is from SMEM directly. +template +struct Smem_tile_hopper_gmma_row_b { + // Currently Interleaved Mode is not implemented. + static_assert(desc_mode != fmha::Gmma_descriptor_mode::SWIZZLE_NONE, + "Currently, Interleaved Mode is not implemented.\n"); + + // For SWIZZLE_64B, row b is not needed/implemented + static_assert(desc_mode != fmha::Gmma_descriptor_mode::SWIZZLE_64B, + "Currently, for SWIZZLE_64B mode, row_b is not needed/implemented. \n"); + + // The MMA tile. + using Mma_tile = typename Traits::template Mma_tile; + + // The number of desc within a gmma group (kblock limited) + static constexpr fmha::Gmma_descriptor_size GMMA_DESC_SIZE_PER_GROUP = + fmha::Gmma_descriptor_size::ONE; + + // The SWIZZLE_128B descriptor + using Gmma_descriptor = + fmha::Gmma_descriptor_b; + + using Cta_tile_gmma = Cta_tile; + + // the size in bits of each element. + enum { BITS_PER_ELEMENT = Traits::BITS_PER_ELEMENT_B }; + + // the size of bytes of each element. + enum { BYTES_PER_ELEMENT = BITS_PER_ELEMENT / 8 }; + + // The size in bytes of a single LDGSTS/STS. + enum { BYTES_PER_STS = 16 }; + + // The number of elements per LDGSTS/STS. + enum { ELEMENTS_PER_STS = BYTES_PER_STS * 8 / BITS_PER_ELEMENT }; + + // SMEM layout for GMMA has a leading dim of exact 128 Byte, at least for SWIZZLE_128B and + // SWIZZLE_64B format + enum { BYTES_PER_ROW = 128 }; + + // the number of rows per one row of N due the the limitation of leading dim size + enum { NUM_ROWS_PER_N = (Cta_tile::N * BYTES_PER_ELEMENT + BYTES_PER_ROW - 1) / BYTES_PER_ROW }; + + // the number of rows per one row of N_PER_GMMA_GROUP + enum { + NUM_ROWS_PER_GMMA_GROUP_N = + (Mma_tile::N_PER_GMMA_GROUP * BYTES_PER_ELEMENT + BYTES_PER_ROW - 1) / BYTES_PER_ROW + }; + + // Number of SMEM rows + enum { NUM_ROWS = Cta_tile::K * NUM_ROWS_PER_N }; + + // The size of one buffer in bytes in shared memory. + enum { BYTES_PER_BUFFER = NUM_ROWS * BYTES_PER_ROW }; + + // the size of one buffer in bytes in shared memory, without the 4 LSB. + // this is needed to increment the GMMA desc to the next buffer + enum { BYTES_PER_BUFFER_NO_4LSB = BYTES_PER_BUFFER / 16 }; + + // this is needed to decrement GMMA desc + enum { + BYTES_PER_BUFFER_INC_BOUNDARY_NO_4LSB = + BYTES_PER_BUFFER_NO_4LSB * BUFFERS_PER_TILE_ - BYTES_PER_BUFFER_NO_4LSB + }; + + // The number of buffers. + enum { BUFFERS_PER_TILE = BUFFERS_PER_TILE_ }; + + // The size in bytes of total buffers. + enum { BYTES_PER_TILE = BYTES_PER_BUFFER * BUFFERS_PER_TILE }; + + // The boundary for smem_read_offset and smem_write_offset increment. 
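+  // Illustration (hedged, hypothetical numbers): move_next_write_buffer() in the row_a tile above
+  // advances the write offset through the ring of buffers as
+  //   offset += (offset >= BYTES_PER_TILE_INC_BOUNDARY) ? -BYTES_PER_TILE_INC_BOUNDARY
+  //                                                     : BYTES_PER_BUFFER;
+  // e.g. assuming BUFFERS_PER_TILE = 2 and BYTES_PER_BUFFER = 8192, the boundary below is 8192 and
+  // the write offset simply toggles between buffer 0 and buffer 1.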
+ enum { BYTES_PER_TILE_INC_BOUNDARY = BYTES_PER_TILE - BYTES_PER_BUFFER }; + + // The number of threads needed to store a row + enum { THREADS_PER_ROW = BYTES_PER_ROW / BYTES_PER_STS }; + + // The number of rows written with a single STS. + enum { ROWS_PER_STS = Cta_tile::THREADS_PER_CTA / THREADS_PER_ROW }; + + // for swizzle_128B the xor factor is 8 + enum { ROWS_PER_XOR_PATTERN = 8 }; + + // The distance in byte between different GMMA groups (might need multiple due to cta tile size) + // each GMMA group is of size GMMA_M x GMMA_N x Kblock + enum { + GMMA_GROUP_SMEM_DISTANCE = + Mma_tile::K_PER_GMMA_GROUP * NUM_ROWS_PER_GMMA_GROUP_N * BYTES_PER_ROW + }; + + // The number of STS per ROW. + enum { STS_PER_ROW = BYTES_PER_ROW / THREADS_PER_ROW / BYTES_PER_STS }; + + // For Hopper, STS_PER_ROW should be 1 (at least for now.) + static_assert(STS_PER_ROW == 1, ""); + + // Ctor. + inline __device__ Smem_tile_hopper_gmma_row_b(char* smem, int tidx) + : smem_(__nvvm_get_smem_pointer(smem)) { + int smem_write_row = tidx / THREADS_PER_ROW; + int smem_write_xor = smem_write_row % ROWS_PER_XOR_PATTERN; + int smem_write_col = (tidx % THREADS_PER_ROW) ^ smem_write_xor; + this->smem_write_offset_ = smem_write_row * BYTES_PER_ROW + smem_write_col * BYTES_PER_STS; + // That code is expected to trigger the utilization of the URF by the compiler. + this->smem_read_buffer_ = __shfl_sync(0xffffffff, 0, 0); + this->smem_write_buffer_ = __shfl_sync(0xffffffff, 0, 0); + } + + // Compute the store pointers. + template + inline __device__ void compute_store_pointers(uint32_t (&ptrs)[N]) { +#pragma unroll + for (int ii = 0; ii < N; ++ii) { + // Decompose the STS into row/col. + int row = ii / STS_PER_ROW; + // Assemble the offset. + int offset = smem_write_offset_ + row * ROWS_PER_STS * BYTES_PER_ROW; + + // Assemble the final pointer :) + ptrs[ii] = smem_ + offset + smem_write_buffer_; + } + } + + template + inline __device__ void store(void const* (&gmem_ptrs)[N], uint32_t (&preds)[M], uint64_t = 0) { + uint32_t smem_ptrs[N]; + this->compute_store_pointers(smem_ptrs); + ldgsts(smem_ptrs, gmem_ptrs, preds); + } + + // Move the write offset to next buffer. + inline __device__ void move_next_write_buffer() { + // if( BUFFERS_PER_TILE > 1 ) { + // this->smem_write_offset_ += ( smem_write_offset_ >= BYTES_PER_TILE_INC_BOUNDARY ) + // ? -BYTES_PER_TILE_INC_BOUNDARY + // : BYTES_PER_BUFFER; + // } + } + + inline __device__ void move_next_write_buffer(int) {} + + // Move the read offset to next buffer. + // do nothing, as it is controlled by gmma desc + inline __device__ void move_next_read_buffer() {} + + // The shared memory pointer. + uint32_t smem_; + // The read offset. Reserve 4 offsets if needed. + int smem_read_offset_; + // The write offset. + int smem_write_offset_; + // The buffer base offset for read. + int smem_read_buffer_; + // The buffer base offset for write. + int smem_write_buffer_; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// Specialized Interface +// LDGSTS smem tiles. +//////////////////////////////////////////////////////////////////////////////////////////////////// +// A Col Major, A coming from SMEM +template +struct Smem_tile_hopper_a + : public Smem_tile_hopper_gmma_col_a { + // The base class. + using Base = Smem_tile_hopper_gmma_col_a; + + // Ctor. + // comment the implementation out as a mark that this is not supported, yet. 
+ // inline __device__ Smem_tile_hopper_a( char *smem, int tidx ) : Base( smem, tidx ) { + //} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// A Row Major, A coming from SMEM +template +struct Smem_tile_hopper_a + : public Smem_tile_hopper_gmma_row_a { + // The base class. + using Base = Smem_tile_hopper_gmma_row_a; + + // Ctor. + inline __device__ Smem_tile_hopper_a(char* smem, int tidx) : Base(smem, tidx) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// B Col Major, B coming from SMEM +template +struct Smem_tile_hopper_b + : public Smem_tile_hopper_gmma_col_b { + // The base class. + using Base = Smem_tile_hopper_gmma_col_b; + + // Ctor. + inline __device__ Smem_tile_hopper_b(char* smem, int tidx) : Base(smem, tidx) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// B Row Major, B coming from SMEM +template +struct Smem_tile_hopper_b + : public Smem_tile_hopper_gmma_row_b { + // The base class. + using Base = Smem_tile_hopper_gmma_row_b; + + // Ctor. + inline __device__ Smem_tile_hopper_b(char* smem, int tidx) : Base(smem, tidx) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// Specialized Interface +// TMA smem tiles. +//////////////////////////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// A Row Major. For GMMA, A is from SMEM directly. +template +struct Smem_tile_hopper_gmma_tma_row_a { + // Currently Interleaved Mode is not implemented. + static_assert(desc_mode != fmha::Gmma_descriptor_mode::SWIZZLE_NONE, + "Currently, SWIZZLE_NONE Mode is not implemented.\n"); + + // The MMA tile. + using Mma_tile = typename Traits::template Mma_tile; + + // The number of desc within a gmma group (kblock limited). + static constexpr fmha::Gmma_descriptor_size GMMA_DESC_SIZE_PER_GROUP = + fmha::Gmma_descriptor_size::ONE; + + // The SWIZZLE_128B descriptor. + using Gmma_descriptor = + fmha::Gmma_descriptor_a; + + using Cta_tile_gmma = Cta_tile; + + // the size in bits of each element. + enum { BITS_PER_ELEMENT = Traits::BITS_PER_ELEMENT_A }; + + // the size of bytes of each element. + enum { BYTES_PER_ELEMENT = BITS_PER_ELEMENT / 8 }; + + // The size in bytes of a single LDGSTS/STS. + enum { BYTES_PER_STS = 16 }; + + // The number of elements per LDGSTS/STS. + enum { ELEMENTS_PER_STS = BYTES_PER_STS * 8 / BITS_PER_ELEMENT }; + + // SMEM layout for GMMA has a leading dim of exact 128 Byte, at least for SWIZZLE_128B + // and SWIZZLE_64B format. + enum { BYTES_PER_ROW = 128 }; + + // the number of rows per one row of K due the the limitation of leading dim size. + enum { NUM_ROWS_PER_K = (Cta_tile::K * BYTES_PER_ELEMENT + BYTES_PER_ROW - 1) / BYTES_PER_ROW }; + + static_assert(desc_mode != fmha::Gmma_descriptor_mode::SWIZZLE_64B || + (Cta_tile::K * BYTES_PER_ELEMENT) == 64, + "swizzle_64B row_a is valid if kblock=32\n"); + + // Number of SMEM rows. + enum { + NUM_ROWS = (desc_mode == fmha::Gmma_descriptor_mode::SWIZZLE_128B) + ? (Cta_tile::M * NUM_ROWS_PER_K) + : (Cta_tile::M / 2) + }; + + // The size of one buffer in bytes in shared memory. + enum { BYTES_PER_BUFFER = NUM_ROWS * BYTES_PER_ROW }; + + // the size of one buffer in bytes in shared memory, without the 4 LSB. 
+ // this is needed to increment the GMMA desc to the next buffer. + enum { BYTES_PER_BUFFER_NO_4LSB = BYTES_PER_BUFFER / 16 }; + + // this is needed to decrement GMMA desc. + enum { + BYTES_PER_BUFFER_INC_BOUNDARY_NO_4LSB = + BYTES_PER_BUFFER_NO_4LSB * BUFFERS_PER_TILE_ - BYTES_PER_BUFFER_NO_4LSB + }; + + // The number of buffers. + enum { BUFFERS_PER_TILE = BUFFERS_PER_TILE_ }; + + // The size in bytes of total buffers. + enum { BYTES_PER_TILE = BYTES_PER_BUFFER * BUFFERS_PER_TILE }; + + // The boundary for smem_read_offset and smem_write_offset increment. + enum { BYTES_PER_TILE_INC_BOUNDARY = BYTES_PER_TILE - BYTES_PER_BUFFER }; + + // The number of threads needed to store a row + enum { THREADS_PER_ROW = BYTES_PER_ROW / BYTES_PER_STS }; + + // The number of rows written with a single STS. + enum { ROWS_PER_STS = Cta_tile::THREADS_PER_CTA / THREADS_PER_ROW }; + + // for swizzle_128B the xor factor is 8 + enum { ROWS_PER_XOR_PATTERN = (desc_mode == fmha::Gmma_descriptor_mode::SWIZZLE_128B) ? 8 : 4 }; + + // The distance in byte between different GMMA groups (might need multiple due to cta tile size) + // each GMMA group is of size GMMA_M x GMMA_N x Kblock + enum { + GMMA_GROUP_SMEM_DISTANCE = Mma_tile::M_PER_GMMA_GROUP / + (desc_mode == fmha::Gmma_descriptor_mode::SWIZZLE_128B ? 1 : 2) * + BYTES_PER_ROW + }; + + // The number of STS per row. + enum { STS_PER_ROW = BYTES_PER_ROW / THREADS_PER_ROW / BYTES_PER_STS }; + + // For Hopper, STS_PER_ROW should be 1 (at least for now.) + static_assert(STS_PER_ROW == 1, ""); + + // Each smem barrier is of 8 bytes + enum { BYTES_PER_SMEM_BARRIER = 8 }; + + // The boundary for smem_read_offset and smem_write_offset increment. + enum { + BYTES_PER_TILE_INC_BOUNDARY_SMEM_BARRIER = + BYTES_PER_SMEM_BARRIER * BUFFERS_PER_TILE - BYTES_PER_SMEM_BARRIER + }; + + // Ctor. + inline __device__ Smem_tile_hopper_gmma_tma_row_a(char* smem, char* smem_barrier) + : smem_(__nvvm_get_smem_pointer(smem)), + smem_barrier_(__nvvm_get_smem_pointer(smem_barrier)), + smem_write_offset_(0), + smem_barrier_offset_(0) {} + + // Move the write offset to next buffer. + // Also move the smem_barrier. + inline __device__ void move_next_write_buffer() { + if (BUFFERS_PER_TILE > 1) { + this->smem_write_offset_ += (smem_write_offset_ >= BYTES_PER_TILE_INC_BOUNDARY) + ? -BYTES_PER_TILE_INC_BOUNDARY + : BYTES_PER_BUFFER; + } + + // also update the smem_barrier. + if (BUFFERS_PER_TILE > 1) { + this->smem_barrier_offset_ += + (smem_barrier_offset_ >= BYTES_PER_TILE_INC_BOUNDARY_SMEM_BARRIER) + ? -BYTES_PER_TILE_INC_BOUNDARY_SMEM_BARRIER + : BYTES_PER_SMEM_BARRIER; + } + } + + inline __device__ void move_next_write_buffer(int) {} + + // Move the read offset to next buffer. + // do nothing, as it is controlled by gmma desc + inline __device__ void move_next_read_buffer() {} + + template + inline __device__ void store(cudaTmaDesc const* p_desc, int32_t const (&coord)[DIM], + uint16_t filter_offsets = 0, uint16_t mcast_cta_mask = 0) { + fmha::utmaldg(p_desc, smem_ + smem_write_offset_, + smem_barrier_ + smem_barrier_offset_, coord); + } + + // The shared memory pointer. + uint32_t smem_; + // The barrier in smem. + uint32_t smem_barrier_; + // The write offset. + int smem_write_offset_; + // The smem barrier offset + int smem_barrier_offset_; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// B Col Major. For GMMA, B is from SMEM directly. 
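+// Sizing sketch (illustrative, assuming a hypothetical fp16 tile): with 2-byte elements and
+// Cta_tile::K = 64, one K-slice spans 64 * 2 = 128 bytes, so NUM_COLS_PER_K = 1 and, in
+// SWIZZLE_128B mode, NUM_COLUMNS = Cta_tile::N. For Cta_tile::N = 64 that gives
+// BYTES_PER_BUFFER = 64 * 128 = 8192 bytes and BYTES_PER_BUFFER_NO_4LSB = 8192 / 16 = 512,
+// which is the amount added to the GMMA descriptor when switching buffers.
+// Unlike the LDGSTS tiles above, the TMA variant below fills a buffer with a single utmaldg
+// and signals completion through a barrier in shared memory instead of per-thread stores.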
+template +struct Smem_tile_hopper_gmma_tma_col_b { + // Currently Interleaved Mode is not implemented. + static_assert(desc_mode != fmha::Gmma_descriptor_mode::SWIZZLE_NONE, + "Currently, Interleaved Mode is not implemented.\n"); + + // The MMA tile. + using Mma_tile = typename Traits::template Mma_tile; + + // The number of desc within a gmma group (kblock limited) + static constexpr fmha::Gmma_descriptor_size GMMA_DESC_SIZE_PER_GROUP = + fmha::Gmma_descriptor_size::ONE; + + // The SWIZZLE_128B descriptor + using Gmma_descriptor = + fmha::Gmma_descriptor_b; + + using Cta_tile_gmma = Cta_tile; + + // the size in bits of each element. + enum { BITS_PER_ELEMENT = Traits::BITS_PER_ELEMENT_B }; + + // the size of bytes of each element. + enum { BYTES_PER_ELEMENT = BITS_PER_ELEMENT / 8 }; + + // The size in bytes of a single LDGSTS/STS. + enum { BYTES_PER_STS = 16 }; + + // The number of elements per LDGSTS/STS. + enum { ELEMENTS_PER_STS = BYTES_PER_STS * 8 / BITS_PER_ELEMENT }; + + // SMEM layout for GMMA has a leading dim of exact 128 Byte, at least for SWIZZLE_128B and + // SWIZZLE_64B format + enum { BYTES_PER_COLUMN = 128 }; + + static_assert(desc_mode != fmha::Gmma_descriptor_mode::SWIZZLE_64B || + (Cta_tile::K * BYTES_PER_ELEMENT) == 64, + "swizzle_64B col_b is valid if kblock=32\n"); + + // the number of columns per one column of K due the the limitation of leading dim size + enum { + NUM_COLS_PER_K = (Cta_tile::K * BYTES_PER_ELEMENT + BYTES_PER_COLUMN - 1) / BYTES_PER_COLUMN + }; + + // Number of SMEM columns. + enum { + NUM_COLUMNS = (desc_mode == fmha::Gmma_descriptor_mode::SWIZZLE_128B) + ? Cta_tile::N * NUM_COLS_PER_K + : Cta_tile::N / 2 + }; + + // The size of one buffer in bytes in shared memory. + enum { BYTES_PER_BUFFER = NUM_COLUMNS * BYTES_PER_COLUMN }; + + // the size of one buffer in bytes in shared memory, without the 4 LSB. + // this is needed to increment the GMMA desc to the next buffer + enum { BYTES_PER_BUFFER_NO_4LSB = BYTES_PER_BUFFER / 16 }; + + // this is needed to decrement GMMA desc. + enum { + BYTES_PER_BUFFER_INC_BOUNDARY_NO_4LSB = + BYTES_PER_BUFFER_NO_4LSB * BUFFERS_PER_TILE_ - BYTES_PER_BUFFER_NO_4LSB + }; + + // The number of buffers. + enum { BUFFERS_PER_TILE = BUFFERS_PER_TILE_ }; + + // The size in bytes of total buffers. + enum { BYTES_PER_TILE = BYTES_PER_BUFFER * BUFFERS_PER_TILE }; + + // The boundary for smem_read_offset and smem_write_offset increment. + enum { BYTES_PER_TILE_INC_BOUNDARY = BYTES_PER_TILE - BYTES_PER_BUFFER }; + + // The number of threads needed to store a column. + enum { THREADS_PER_COLUMN = BYTES_PER_COLUMN / BYTES_PER_STS }; + + // The number of columns written with a single STS. + enum { COLUMNS_PER_STS = Cta_tile::THREADS_PER_CTA / THREADS_PER_COLUMN }; + + // for swizzle_128B the xor factor is 8. + enum { + COLUMNS_PER_XOR_PATTERN = (desc_mode == fmha::Gmma_descriptor_mode::SWIZZLE_128B) ? 8 : 4 + }; + + // The distance in byte between different GMMA groups (might need multiple due to cta tile size) + // each GMMA group is of size GMMA_M x GMMA_N x Kblock + enum { + GMMA_GROUP_SMEM_DISTANCE = Mma_tile::N_PER_GMMA_GROUP / + (desc_mode == fmha::Gmma_descriptor_mode::SWIZZLE_128B ? 1 : 2) * + BYTES_PER_COLUMN + }; + + // The number of STS per column. + enum { STS_PER_COLUMN = BYTES_PER_COLUMN / THREADS_PER_COLUMN / BYTES_PER_STS }; + + // For Hopper, STS_PER_COLUMN should be 1 (at least for now.) + static_assert(STS_PER_COLUMN == 1, ""); + + // Ctor. 
+ inline __device__ Smem_tile_hopper_gmma_tma_col_b(char* smem, char* smem_barrier) + : smem_(__nvvm_get_smem_pointer(smem)), + smem_barrier_(__nvvm_get_smem_pointer(smem_barrier)) {} + + // Move the write offset to next buffer. + // Not implemented as it is not needed currently. + inline __device__ void move_next_write_buffer() {} + + inline __device__ void move_next_write_buffer(int) {} + + // Move the read offset to next buffer. + // do nothing, as it is controlled by gmma desc + inline __device__ void move_next_read_buffer() {} + + template + inline __device__ void store(cudaTmaDesc const* p_desc, int32_t const (&coord)[DIM], + uint16_t filter_offsets = 0, uint16_t mcast_cta_mask = 0) { + fmha::utmaldg(p_desc, smem_, smem_barrier_, coord); + } + + // The shared memory pointer. + uint32_t smem_; + // The barrier in smem. + uint32_t smem_barrier_; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// B Row Major. For GMMA, B is from SMEM directly. +template +struct Smem_tile_hopper_gmma_tma_row_b { + // Currently Interleaved Mode is not implemented. + static_assert(desc_mode != fmha::Gmma_descriptor_mode::SWIZZLE_NONE, + "Currently, Interleaved Mode is not implemented.\n"); + + // For SWIZZLE_64B, row b is not needed/implemented + static_assert(desc_mode != fmha::Gmma_descriptor_mode::SWIZZLE_64B, + "Currently, for SWIZZLE_64B mode, row_b is not needed/implemented. \n"); + + // The MMA tile. + using Mma_tile = typename Traits::template Mma_tile; + + // The number of desc within a gmma group (kblock limited) + static constexpr fmha::Gmma_descriptor_size GMMA_DESC_SIZE_PER_GROUP = + fmha::Gmma_descriptor_size::ONE; + + // The SWIZZLE_128B descriptor + using Gmma_descriptor = + fmha::Gmma_descriptor_b; + + using Cta_tile_gmma = Cta_tile; + + // the size in bits of each element. + enum { BITS_PER_ELEMENT = Traits::BITS_PER_ELEMENT_B }; + + // the size of bytes of each element. + enum { BYTES_PER_ELEMENT = BITS_PER_ELEMENT / 8 }; + + // The size in bytes of a single LDGSTS/STS. + enum { BYTES_PER_STS = 16 }; + + // The number of elements per LDGSTS/STS. + enum { ELEMENTS_PER_STS = BYTES_PER_STS * 8 / BITS_PER_ELEMENT }; + + // SMEM layout for GMMA has a leading dim of exact 128 Byte, at least for SWIZZLE_128B and + // SWIZZLE_64B format + enum { BYTES_PER_ROW = 128 }; + + // the number of rows per one row of N due the the limitation of leading dim size + enum { NUM_ROWS_PER_N = (Cta_tile::N * BYTES_PER_ELEMENT + BYTES_PER_ROW - 1) / BYTES_PER_ROW }; + + // the number of rows per one row of N_PER_GMMA_GROUP + enum { + NUM_ROWS_PER_GMMA_GROUP_N = + (Mma_tile::N_PER_GMMA_GROUP * BYTES_PER_ELEMENT + BYTES_PER_ROW - 1) / BYTES_PER_ROW + }; + + // Number of SMEM rows + enum { NUM_ROWS = Cta_tile::K * NUM_ROWS_PER_N }; + + // The size of one buffer in bytes in shared memory. + enum { BYTES_PER_BUFFER = NUM_ROWS * BYTES_PER_ROW }; + + // the size of one buffer in bytes in shared memory, without the 4 LSB. + // this is needed to increment the GMMA desc to the next buffer + enum { BYTES_PER_BUFFER_NO_4LSB = BYTES_PER_BUFFER / 16 }; + + // this is needed to decrement GMMA desc + enum { + BYTES_PER_BUFFER_INC_BOUNDARY_NO_4LSB = + BYTES_PER_BUFFER_NO_4LSB * BUFFERS_PER_TILE_ - BYTES_PER_BUFFER_NO_4LSB + }; + + // The number of buffers. + enum { BUFFERS_PER_TILE = BUFFERS_PER_TILE_ }; + + // The size in bytes of total buffers. 
+ enum { BYTES_PER_TILE = BYTES_PER_BUFFER * BUFFERS_PER_TILE }; + + // The boundary for smem_read_offset and smem_write_offset increment. + enum { BYTES_PER_TILE_INC_BOUNDARY = BYTES_PER_TILE - BYTES_PER_BUFFER }; + + // The number of threads needed to store a row + enum { THREADS_PER_ROW = BYTES_PER_ROW / BYTES_PER_STS }; + + // The number of rows written with a single STS. + enum { ROWS_PER_STS = Cta_tile::THREADS_PER_CTA / THREADS_PER_ROW }; + + // for swizzle_128B the xor factor is 8 + enum { ROWS_PER_XOR_PATTERN = 8 }; + + // The distance in byte between different GMMA groups (might need multiple due to cta tile size) + // each GMMA group is of size GMMA_M x GMMA_N x Kblock + enum { + GMMA_GROUP_SMEM_DISTANCE = + Mma_tile::K_PER_GMMA_GROUP * NUM_ROWS_PER_GMMA_GROUP_N * BYTES_PER_ROW + }; + + // The number of STS per ROW. + enum { STS_PER_ROW = BYTES_PER_ROW / THREADS_PER_ROW / BYTES_PER_STS }; + + // For Hopper, STS_PER_ROW should be 1 (at least for now.) + static_assert(STS_PER_ROW == 1, ""); + + // Ctor. + inline __device__ Smem_tile_hopper_gmma_tma_row_b(char* smem, char* smem_barrier) + : smem_(__nvvm_get_smem_pointer(smem)), + smem_barrier_(__nvvm_get_smem_pointer(smem_barrier)) {} + + // Move the write offset to next buffer. + // Not implemented since it is not needed at the moment. + inline __device__ void move_next_write_buffer() {} + + inline __device__ void move_next_write_buffer(int) {} + + // Move the read offset to next buffer. + // do nothing, as it is controlled by gmma desc + inline __device__ void move_next_read_buffer() {} + + template + inline __device__ void store(cudaTmaDesc const* p_desc, int32_t const (&coord)[DIM], + uint16_t filter_offsets = 0, uint16_t mcast_cta_mask = 0) { + fmha::utmaldg(p_desc, smem_, smem_barrier_, coord); + } + + // The shared memory pointer. + uint32_t smem_; + // The barrier in smem. + uint32_t smem_barrier_; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// A Row Major, A coming from SMEM +template +struct Smem_tile_hopper_a + : public Smem_tile_hopper_gmma_tma_row_a { + // The base class. + using Base = Smem_tile_hopper_gmma_tma_row_a; + + // Ctor. + inline __device__ Smem_tile_hopper_a(char* smem, char* smem_barrier) : Base(smem, smem_barrier) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// B Col Major, B coming from SMEM +template +struct Smem_tile_hopper_b + : public Smem_tile_hopper_gmma_tma_col_b { + // The base class. + using Base = Smem_tile_hopper_gmma_tma_col_b; + + // Ctor. + inline __device__ Smem_tile_hopper_b(char* smem, char* smem_barrier) : Base(smem, smem_barrier) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// B Row Major, B coming from SMEM +template +struct Smem_tile_hopper_b + : public Smem_tile_hopper_gmma_tma_row_b { + // The base class. + using Base = Smem_tile_hopper_gmma_tma_row_b; + + // Ctor. + inline __device__ Smem_tile_hopper_b(char* smem, char* smem_barrier) : Base(smem, smem_barrier) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace wip + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + // The instruction traits. + typename Traits_, + // The description of the tile computed by this CTA. + typename Cta_tile_, + // The layout of the tile. 
+ typename Layout_, + // The number of bytes per STS. + int BYTES_PER_STS_, + // The number of buffers. (Used in multistage and double buffer cases.) + int BUFFERS_PER_TILE_, + // GMMA descriptor mode + fmha::Gmma_descriptor_mode desc_mode, + // Whether to use TMA. + bool USE_TMA, + // Whether A is coming for RF. + bool GMMA_A_RF = Traits_::GMMA_A_RF> +struct Smem_tile_hopper_a : public fmha::Smem_tile_without_skews< + Cta_tile_, Layout_::COL ? Cta_tile_::K : Cta_tile_::M, + Layout_::COL ? Cta_tile_::M : Cta_tile_::K, + Traits_::BITS_PER_ELEMENT_A, BYTES_PER_STS_, BUFFERS_PER_TILE_, 0, + (desc_mode == fmha::Gmma_descriptor_mode::SWIZZLE_128B ? 8 + : desc_mode == fmha::Gmma_descriptor_mode::SWIZZLE_64B ? 4 + : 2), + 1, true, USE_TMA, 128 * 8 / Traits_::BITS_PER_ELEMENT_A> { + using Traits = Traits_; + using Cta_tile = Cta_tile_; + // The base class. + using Base = fmha::Smem_tile_without_skews< + Cta_tile, Layout_::COL ? Cta_tile::K : Cta_tile::M, Layout_::COL ? Cta_tile::M : Cta_tile::K, + Traits::BITS_PER_ELEMENT_A, BYTES_PER_STS_, BUFFERS_PER_TILE_, 0, + (desc_mode == fmha::Gmma_descriptor_mode::SWIZZLE_128B ? 8 + : desc_mode == fmha::Gmma_descriptor_mode::SWIZZLE_64B ? 4 + : 2), + 1, true, USE_TMA, 128 * 8 / Traits::BITS_PER_ELEMENT_A>; + + // The MMA tile. + using Mma_tile = typename Traits::template Mma_tile; + // The layout + using Layout = Layout_; + // The fragment. + using Fragment = fmha::Fragment_a; + + // The number of desc within a gmma group (kblock limited) + static constexpr fmha::Gmma_descriptor_size GMMA_DESC_SIZE_PER_GROUP = + fmha::Gmma_descriptor_size::ONE; + // The SWIZZLE_128B descriptor + using Gmma_descriptor = + fmha::Gmma_descriptor_a; + + // the number of columns per one column of M_PER_GMMA_GROUP + enum { + NUM_COLS_PER_GMMA_GROUP_M = + (Mma_tile::M_PER_GMMA_GROUP * Base::BITS_PER_ELEMENT / 8 + Base::BYTES_PER_ROW - 1) / + Base::BYTES_PER_ROW + }; + + // The distance in byte between different GMMA groups (might need multiple due to cta tile size) + // each GMMA group is of size GMMA_M x GMMA_N x Kblock + static constexpr int GMMA_GROUP_SMEM_DISTANCE = + Layout::COL ? (Mma_tile::K_PER_GMMA_GROUP * NUM_COLS_PER_GMMA_GROUP_M * Base::BYTES_PER_ROW * + Cta_tile::WARP_GROUP_M) + : (Mma_tile::M_PER_GMMA_GROUP * Cta_tile::WARP_GROUP_M / + (desc_mode == fmha::Gmma_descriptor_mode::SWIZZLE_128B ? 1 + : desc_mode == fmha::Gmma_descriptor_mode::SWIZZLE_64B ? 2 + : 4) * + Base::BYTES_PER_ROW); + + // the size of one buffer in bytes in shared memory, without the 4 LSB. + // this is needed to increment the GMMA desc to the next buffer + enum { BYTES_PER_BUFFER_NO_4LSB = Base::BYTES_PER_BUFFER / 16 }; + + // this is needed to decrement GMMA desc + enum { + BYTES_PER_BUFFER_INC_BOUNDARY_NO_4LSB = + BYTES_PER_BUFFER_NO_4LSB * BUFFERS_PER_TILE_ - BYTES_PER_BUFFER_NO_4LSB + }; + + // Ctor. + inline __device__ Smem_tile_hopper_a(void* smem, int tidx) : Base(smem, tidx) {} + + // set the scale and bias smem pointer + inline __device__ void set_scale_bias_smem_ptr(char* scale_bias_smem_ptr, int tidx, int k) {} + + // Load from shared memory. + template + inline __device__ void load(Fragment (&a)[Mma_tile::MMAS_M], int ki) {} + + // Move the read offset to next buffer. 
+ // do nothing, as it is controlled by gmma desc + inline __device__ void move_next_read_buffer() {} + + // Overload set needs to be replicated for compatibility + inline __device__ void move_next_read_buffer(int N) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + // The instruction traits. + typename Traits_, + // The description of the tile computed by this CTA. + typename Cta_tile_, + // The layout of the tile. + typename Layout_, + // The number of bytes per STS. + int BYTES_PER_STS_, + // The number of buffers. (Used in multistage and double buffer cases.) + int BUFFERS_PER_TILE_, + // GMMA descriptor mode + fmha::Gmma_descriptor_mode desc_mode, + // USe TMA or not, + bool USE_TMA> +struct Smem_tile_hopper_b + : public fmha::Smem_tile_without_skews< + Cta_tile_, + Layout_::COL ? Cta_tile_::N : Cta_tile_::K, // ROWS + Layout_::COL ? Cta_tile_::K : Cta_tile_::N, // COLS + Traits_::BITS_PER_ELEMENT_B, BYTES_PER_STS_, BUFFERS_PER_TILE_, + 0, // LDS_FAST_PATH + // Determine ROWS_PER_XOR_PATTERN from the swizzle mode: + (desc_mode == fmha::Gmma_descriptor_mode::SWIZZLE_128B ? 8 + : desc_mode == fmha::Gmma_descriptor_mode::SWIZZLE_64B ? 4 + : /* 32B or NONE */ 2), + 1, // COLS_PER_XOR_PATTERN + true, // USE_PREDICATES + USE_TMA, + 128 * 8 / Traits_::BITS_PER_ELEMENT_B // LEAD_DIM_ELEMENTS + > { + using Traits = Traits_; + using Cta_tile = Cta_tile_; + // The base class. + using Base = fmha::Smem_tile_without_skews< + Cta_tile, Layout_::COL ? Cta_tile::N : Cta_tile::K, Layout_::COL ? Cta_tile::K : Cta_tile::N, + Traits::BITS_PER_ELEMENT_B, BYTES_PER_STS_, BUFFERS_PER_TILE_, 0, + (desc_mode == fmha::Gmma_descriptor_mode::SWIZZLE_128B ? 8 + : desc_mode == fmha::Gmma_descriptor_mode::SWIZZLE_64B ? 4 + : 2), + 1, true, USE_TMA, 128 * 8 / Traits::BITS_PER_ELEMENT_B>; + + // The MMA tile. + using Mma_tile = typename Traits::template Mma_tile; + // The layout + using Layout = Layout_; + // The fragment. + using Fragment = fmha::Fragment_b; + + // The number of desc within a gmma group (kblock limited) + static constexpr fmha::Gmma_descriptor_size GMMA_DESC_SIZE_PER_GROUP = + fmha::Gmma_descriptor_size::ONE; + // The SWIZZLE_128B descriptor + using Gmma_descriptor = + fmha::Gmma_descriptor_b; + + // the number of rows per one row of N_PER_GMMA_GROUP + enum { + NUM_ROWS_PER_GMMA_GROUP_N = + (Mma_tile::N_PER_GMMA_GROUP * Base::BITS_PER_ELEMENT / 8 + Base::BYTES_PER_ROW - 1) / + Base::BYTES_PER_ROW + }; + + // The distance in byte between different GMMA groups (might need multiple due to cta tile size) + // each GMMA group is of size GMMA_M x GMMA_N x Kblock + + // The dimension that we split. + // Add buffers when we have multiple buffers for split head dimensions. + // Split-d smem view (2 split D, and 3 buffers): d0, d0, d0, d1, d1, d1. + static constexpr int GMMA_GROUP_SPLIT_DIM = + Layout::COL ? Mma_tile::N_PER_GMMA_GROUP : (Mma_tile::K_PER_GMMA_GROUP * BUFFERS_PER_TILE_); + + // The split factor. + static constexpr int GMMA_GROUP_SPLIT_FACTOR = + (desc_mode == fmha::Gmma_descriptor_mode::SWIZZLE_128B ? 1 + : desc_mode == fmha::Gmma_descriptor_mode::SWIZZLE_64B ? 2 + : 4); + + // Make sure the dimension that we split is a multiple of the split factor. + static_assert(GMMA_GROUP_SPLIT_DIM % GMMA_GROUP_SPLIT_FACTOR == 0); + + // The distance between two "groups" in shared memory. 
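+  // Worked example (hedged, hypothetical numbers): for a row-major B tile in SWIZZLE_128B mode the
+  // split factor is 1 and the split dimension is K_PER_GMMA_GROUP * BUFFERS_PER_TILE_. Assuming
+  // K_PER_GMMA_GROUP = 64, BUFFERS_PER_TILE_ = 2 and a 128-byte leading dimension
+  // (Base::BYTES_PER_ROW), the distance below evaluates to (64 * 2) / 1 * 128 = 16384 bytes
+  // between consecutive GMMA groups.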
+ static constexpr int GMMA_GROUP_SMEM_DISTANCE = + GMMA_GROUP_SPLIT_DIM / GMMA_GROUP_SPLIT_FACTOR * Base::BYTES_PER_ROW; + + // the size of one buffer in bytes in shared memory, without the 4 LSB. + // this is needed to increment the GMMA desc to the next buffer + enum { BYTES_PER_BUFFER_NO_4LSB = Base::BYTES_PER_BUFFER / 16 }; + + // this is needed to decrement GMMA desc + enum { + BYTES_PER_BUFFER_INC_BOUNDARY_NO_4LSB = + BYTES_PER_BUFFER_NO_4LSB * BUFFERS_PER_TILE_ - BYTES_PER_BUFFER_NO_4LSB + }; + + // Ctor. + inline __device__ Smem_tile_hopper_b(void* smem, int tidx) : Base(smem, tidx) { + warp_id_ = tidx / 32; + lane_id_ = tidx % 32; + + // each pair of warps transposes 8x8 in place + // each warp responsible for diagonal 4x4s + // calculate index in 8x8 block + block_row_ = lane_id_ / 4; + block_col_ = (lane_id_ % 4) + ((warp_id_ % 2) ^ (block_row_ / 4)) * 4; + + // diagonal 4x4s will 2x conflict for SWIZZLE_32B + // 1 warp per 8x8, 2 4x8 load+store + if (Traits::GMMA_N == 8) { + block_row_ = lane_id_ / 8; + block_col_ = lane_id_ % 8; + } + + // offset when all 4 warps participate in transpose + block_col_offset_ = (warp_id_ / 2) * 8; + } + + int warp_id_, lane_id_; + int block_row_, block_col_, block_col_offset_; + + // Load from shared memory. + inline __device__ void load(Fragment (&b)[Mma_tile::MMAS_N], int ki) {} + + // Load from smem, do something (e.g. transpose), then store back to smem + inline __device__ void load_and_store(int ki) { + /* + using B_type = typename Traits::B_type; + + // TODO: move these to B_RF smem tiles + + // 8 channel per group fp16 fprop/dgrad with 64x16x16 gmma + // move 8x8 OOB zeros to right diagonal, 8x8 in-bounds weights on left diagonal + if (Cta_tile::N_PER_GROUP == 8 && Traits::GMMA_N == 16 + && Traits::BITS_PER_ELEMENT_B == 16) { + // just need to swap 2 cores within a single SWIZZLE_32B, one of which is just zero + // 1 LDSM.M88.1 + if (warp_id_ == 0) { + int smem_row_offset = ki * 4 * 128 + 2 * 128; // 4 rows per 16x16, swap the bottom 8x16 + int lds_block_idx = lane_id_ * 2; // ldsm.m88.1 only uses first 8 threads for address + int lds_smem_idx = lds_block_idx ^ (lane_id_ / 4); + + uint32_t data; + uint32_t lds_smem_ptr = this->smem_ + this->smem_read_buffer_ + + smem_row_offset + + lds_smem_idx * 16; + fmha::ldsm(data, lds_smem_ptr); + + __syncwarp(); + + // move values to adjacent core + fmha::stsm(lds_smem_ptr ^ 16, data); + + // set zeros at previous core + fmha::stsm(lds_smem_ptr, static_cast(0)); + } + } + + // 4 channel per group tf32 fprop with 64x8x8 gmma + // move 4x4 in-bounds weights on left diagonal, OOB zeros everywhere else + if (Cta_tile::N_PER_GROUP == 4 && Traits::GMMA_N == 8 + && Layout::COL && Traits::BITS_PER_ELEMENT_B == 32) { + // just need to swap the bottom 4x8, 1 elt per thread for 1 warp + // 1 lds/sts.32 per thread + if (warp_id_ == 0) { + int smem_row_offset = ki * Base::ROWS_PER_XOR_PATTERN * 128 + 128; + int lds_smem_idx = lane_id_; + uint32_t lds_ptr = this->smem_ + this->smem_read_buffer_ + + smem_row_offset + + lds_smem_idx * sizeof(B_type); + uint32_t data; + lds(data, lds_ptr); + + __syncwarp(); + + sts(lds_ptr ^ 16, data); + } + } + + // partial transpose of 8xN_PER_GROUP operand for tf32 grouped dgrad + // todo: revise this for tf32 grouped wgrad, move to partial specialization + static constexpr bool IS_TF32_GROUPED_DGRAD = + (Cta_tile::GROUPS_N > 1 && Cta_tile::GROUPS_K > 1 || Cta_tile::N_PER_GROUP == 32) + && Layout::ROW && Traits::BITS_PER_ELEMENT_B == 32; + if (IS_TF32_GROUPED_DGRAD) { + static 
constexpr int XOR_SCALE = 16 / sizeof(B_type); // 16B swizzle over 4B elements + static constexpr int ROWS_PER_128B = kDivUp( 128, Traits::GMMA_N * sizeof(B_type) ); + + if (Traits::GMMA_N == 8) { + if (warp_id_ == 0) { + + int smem_row_offset = ki * Base::ROWS_PER_XOR_PATTERN * 128; + uint32_t data[2]; + + #pragma unroll + for (int ii = 0; ii < 2; ii++) { + // get index in row-major 8x8 + int lds_block_row = block_row_ + ii * 4; + int lds_block_col = block_col_; + int lds_block_idx = lds_block_row * 8 + lds_block_col; + + // swizzle + int lds_xor_factor = (lds_block_row / ROWS_PER_128B) * XOR_SCALE; + int lds_smem_idx = lds_block_idx ^ lds_xor_factor; + + // Load from smem + uint32_t lds_ptr = this->smem_ + this->smem_read_buffer_ + + smem_row_offset + + lds_smem_idx * sizeof(B_type); + lds(data[ii], lds_ptr); + } + + __syncwarp(); + + #pragma unroll + for (int ii = 0; ii < 2; ii++) { + // get index in col-major 8x8 + int sts_block_row = block_col_; + int sts_block_col = block_row_ + ii * 4; + if (Cta_tile::N_PER_GROUP == 4 && ii == 1) { + // place 4x4 weights on diagonal for 4-channel tf32 group dgrad + sts_block_row ^= 4; + } + int sts_block_idx = sts_block_row * 8 + sts_block_col; + + // swizzle + int sts_xor_factor = (sts_block_row / ROWS_PER_128B) * XOR_SCALE; + int sts_smem_idx = sts_block_idx ^ sts_xor_factor; + + // store to smem + uint32_t sts_ptr = this->smem_ + this->smem_read_buffer_ + + smem_row_offset + + sts_smem_idx * sizeof(B_type); + sts(sts_ptr, data[ii]); + } + + } // warp_id == 0 + } else { + // loop over 8x16 blocks + #pragma unroll + for (int ii = 0; ii < kDivUp(Cta_tile::N_PER_GROUP, 16); ii++) { + int smem_row_offset = ki * Base::ROWS_PER_XOR_PATTERN * 128; + + // get index in row-major 8xN_PER_GROUP + int lds_block_row = block_row_; + int lds_block_col = block_col_ + block_col_offset_ + ii * 16; + int lds_block_idx = lds_block_row * Cta_tile::N_PER_GROUP + + lds_block_col; + + // swizzle + int lds_xor_factor = (lds_block_row / ROWS_PER_128B) * XOR_SCALE; + int lds_smem_idx = lds_block_idx ^ lds_xor_factor; + + // Load from smem + uint32_t lds_ptr = this->smem_ + this->smem_read_buffer_ + + smem_row_offset + + lds_smem_idx * sizeof(B_type); + uint32_t data; + lds(data, lds_ptr); + + __syncwarp(); + + // get index in row-major 8xN_PER_GROUP with 8x8 in-place transposes + int sts_block_row = block_col_; + int sts_block_col = block_row_ + block_col_offset_ + ii * 16; + int sts_block_idx = sts_block_row * Cta_tile::N_PER_GROUP + + sts_block_col; + + // swizzle + int sts_xor_factor = (sts_block_row / ROWS_PER_128B) * XOR_SCALE; + int sts_smem_idx = sts_block_idx ^ sts_xor_factor; + + // store to smem + uint32_t sts_ptr = this->smem_ + this->smem_read_buffer_ + + smem_row_offset + + sts_smem_idx * sizeof(B_type); + sts(sts_ptr, data); + } + } + } + + // make sure sts are visible to gmma + fence_view_async_shared(); + */ + } + + // Move the read offset to next buffer. + inline __device__ void move_next_read_buffer() {} + + // Move the read offset to next buffer. + inline __device__ void move_next_read_buffer(int buffer_id) { + this->smem_read_buffer_ = buffer_id * Base::BYTES_PER_BUFFER; + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < // GMMA instruction shape in M dim + int GMMA_M, + // GMMA instruction shape in N dim + int GMMA_N, + // GMMA instruction shape in K dim + int GMMA_K, + // GMMA A operand coming from RF? + bool GMMA_A_RF, + // GMMA B operand coming from RF? 
+ bool GMMA_B_RF, + // The description of the tile computed by this CTA. + typename Cta_tile, + // GMMA descriptor mode + fmha::Gmma_descriptor_mode desc_mode, + // Use TMA or not, + bool USE_TMA, int BUFFERS_PER_TILE> +struct Smem_tile_v, + Cta_tile, BUFFERS_PER_TILE, desc_mode, USE_TMA> + : public fmha::Smem_tile_hopper_b< + fmha::Hopper_hgmma_fp16_traits, Cta_tile, + fmha::Row, + 16, // BYTES_PER_STS + BUFFERS_PER_TILE, desc_mode, USE_TMA> { + static constexpr bool TRANSPOSE = false; + + using Cta_tile_gmma = Cta_tile; + + using Base = fmha::Smem_tile_hopper_b< + fmha::Hopper_hgmma_fp16_traits, Cta_tile, + fmha::Row, + 16, // BYTES_PER_STS + BUFFERS_PER_TILE, desc_mode, USE_TMA>; + + inline __device__ Smem_tile_v(void* smem, int tidx) : Base(smem, tidx) {} + + inline __device__ void transpose_tile(int) { + // Transpose is fused into HGMMA. + } + + inline __device__ void transpose_tile(int, uint32_t, uint32_t) { + // Transpose is fused into HGMMA. + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < // GMMA instruction shape in M dim + int GMMA_M, + // GMMA instruction shape in N dim + int GMMA_N, + // GMMA instruction shape in K dim + int GMMA_K, + // GMMA A operand coming from RF? + bool GMMA_A_RF, + // GMMA B operand coming from RF? + bool GMMA_B_RF, + // The description of the tile computed by this CTA. + typename Cta_tile, + // GMMA descriptor mode + fmha::Gmma_descriptor_mode desc_mode, + // Use TMA or not, + bool USE_TMA, int BUFFERS_PER_TILE> +struct Smem_tile_v, + Cta_tile, BUFFERS_PER_TILE, desc_mode, USE_TMA> + : public fmha::Smem_tile_hopper_b< + fmha::Hopper_hgmma_fp32_traits, Cta_tile, + fmha::Row, + 16, // BYTES_PER_STS + BUFFERS_PER_TILE, // BUFFERS_PER_TILE, + desc_mode, USE_TMA> { + static constexpr bool TRANSPOSE = false; + + using Cta_tile_gmma = Cta_tile; + + using Base = fmha::Smem_tile_hopper_b< + fmha::Hopper_hgmma_fp32_traits, Cta_tile, + fmha::Row, + 16, // BYTES_PER_STS + BUFFERS_PER_TILE, // BUFFERS_PER_TILE, + desc_mode, USE_TMA>; + + inline __device__ Smem_tile_v(void* smem, int tidx) : Base(smem, tidx) {} + + inline __device__ void transpose_tile(int) { + // Transpose is fused into HGMMA. + } + + inline __device__ void transpose_tile(int, uint32_t, uint32_t) { + // Transpose is fused into HGMMA. + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < // GMMA instruction shape in M dim + int GMMA_M, + // GMMA instruction shape in N dim + int GMMA_N, + // GMMA instruction shape in K dim + int GMMA_K, + // GMMA A operand coming from RF? + bool GMMA_A_RF, + // GMMA B operand coming from RF? + bool GMMA_B_RF, + // The description of the tile computed by this CTA. 
+ typename Cta_tile, + // GMMA descriptor mode + fmha::Gmma_descriptor_mode desc_mode, + // Use TMA or not, + bool USE_TMA, int BUFFERS_PER_TILE> +struct Smem_tile_v, + Cta_tile, BUFFERS_PER_TILE, desc_mode, USE_TMA> + : public fmha::Smem_tile_hopper_b< + fmha::Hopper_hgmma_bf16_traits, Cta_tile, + fmha::Row, + 16, // BYTES_PER_STS + BUFFERS_PER_TILE, // BUFFERS_PER_TILE, + desc_mode, USE_TMA> { + static constexpr bool TRANSPOSE = false; + + using Cta_tile_gmma = Cta_tile; + + using Base = fmha::Smem_tile_hopper_b< + fmha::Hopper_hgmma_bf16_traits, Cta_tile, + fmha::Row, + 16, // BYTES_PER_STS + BUFFERS_PER_TILE, // BUFFERS_PER_TILE, + desc_mode, USE_TMA>; + + inline __device__ Smem_tile_v(void* smem, int tidx) : Base(smem, tidx) {} + + inline __device__ void transpose_tile(int) { + // Transpose is fused into HGMMA. + } + + inline __device__ void transpose_tile(int, uint32_t, uint32_t) { + // Transpose is fused into HGMMA. + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Transposer {}; + +template +struct Transposer { + static_assert(Cta_tile::K % 128 == 0); + + enum { + WARPS_M = Cta_tile::WARPS_M, + WARPS_N = Cta_tile::WARPS_N, + WARPS_K = Cta_tile::WARPS_K, + }; + + enum { + WARPS_4x1x1 = (WARPS_M == 4 && WARPS_N == 1 && WARPS_K == 1), + WARPS_4x1x2 = (WARPS_M == 4 && WARPS_N == 1 && WARPS_K == 2), + }; + + enum { BYTES_PER_LDS = 16 }; + + enum { BYTES_PER_ROW = 128 }; + + // D=64 and 4 warps. + // Per warp we load 32 rows x 16 columns with LDSM.Tx4, 128 rows per CTA. + enum { S = Cta_tile::K >= 128 ? 128 : Cta_tile::K }; // The sequence length. + + enum { D = Cta_tile::N >= 128 ? 128 : Cta_tile::N }; // The head dimension. + + // static_assert(S % 128 == 0); + static_assert(WARPS_4x1x1 || WARPS_4x1x2); + static_assert(D % (BYTES_PER_LDS * WARPS_K) == 0); + + enum { ROWS_PER_LDSM_PER_CTA_WITHOUT_PACKING = 128 }; // LDSMx4 + + enum { ROW_PACKING = BYTES_PER_ROW / (D * sizeof(typename Traits::B_type)) }; + + enum { ROWS_PER_LDSM_PER_CTA = ROWS_PER_LDSM_PER_CTA_WITHOUT_PACKING / ROW_PACKING }; + + enum { ROWS_PER_XOR_PATTERN = fmha::Rows_per_xor_pattern_ampere_b::VALUE }; + + static_assert(ROWS_PER_XOR_PATTERN == 8); + + // The number of loads in K dimension. + enum { K = S / ROWS_PER_LDSM_PER_CTA_WITHOUT_PACKING }; + + // static_assert(K * ROWS_PER_LDSM_PER_CTA_WITHOUT_PACKING == S); + // static_assert(K == 3); + // The number of loads in the D dimension. + enum { N = D / (BYTES_PER_LDS * WARPS_K) }; // 16 bytes per load + + static_assert(N * BYTES_PER_LDS * WARPS_K == D); + + uint4 regs_[UNROLL_N][K]; + + uint32_t read_offset_; + uint32_t write_offset_; + uint32_t smem_read_loc_; + uint32_t smem_write_loc_; + + inline __device__ Transposer(int tidx) { + int read_row, read_col; + + if (WARPS_4x1x1 && N == 8) { // D=128, 1 warp in N + read_row = (tidx & 0x7f); + read_col = (tidx & 0x07); + } else if (WARPS_4x1x1 && N == 4) { // D=64, 1 warp in N + read_row = (tidx & 0xe0) / 2 + (tidx & 0x1e) / 2; + read_col = (tidx & 0x01) * 4 + (tidx & 0x06) / 2; + } else if (WARPS_4x1x1 && N == 2) { // D=32, 1 warp in N + read_row = (tidx & 0x60) / 4 + (tidx & 0x1c) / 4; + read_col = (tidx & 0x03) * 2; + read_col ^= (read_row & 0x01); + } else if (WARPS_4x1x2 && N == 4) { // D=128, 2 warps in N + read_row = (tidx & 0x7f); + read_col = (tidx & 0x07); + // For two warpgroups we do two steps in N at once. 
+ read_col ^= (tidx & 0x80) / 128; + } else if (WARPS_4x1x2 && N == 2) { // D=64, 2 warps in N + read_row = (tidx & 0x60) / 2 + (tidx & 0x1e) / 2; + read_col = (tidx & 0x01) * 4 + (tidx & 0x06) / 2; + // For two warpgroups we do two steps in N at once. + read_col ^= (tidx & 0x80) / 128; + } else if (WARPS_4x1x2 && N == 1) { // D=32, 2 warps in N + read_row = (tidx & 0x60) / 4 + (tidx & 0x1c) / 4; + read_col = (tidx & 0x03) * 2; + read_col ^= (read_row & 0x01); + // For two warpgroups we do two steps in N at once. + read_col ^= (tidx & 0x80) / 128; + } else { + assert(false); + } + + read_offset_ = read_row * BYTES_PER_ROW + read_col * BYTES_PER_LDS; + + int write_row, write_col; + if (WARPS_4x1x1) { // swizzle_128byte + write_row = (tidx & 0x10) / 2 + (tidx & 0x07); + write_col = (tidx & 0x60) / 16 + (tidx & 0x08) / 8; + } else if (WARPS_4x1x2) { + // Same as above, with second warp group writing next 16 rows. + write_row = (tidx & 0x80) / 8 + (tidx & 0x10) / 2 + (tidx & 0x07); + write_col = (tidx & 0x60) / 16 + (tidx & 0x08) / 8; + } else { + assert(false); + } + + write_col ^= (write_row & 0x07); + + write_offset_ = write_row * BYTES_PER_ROW + write_col * BYTES_PER_LDS; + } + + inline __device__ void transpose(int tidx, uint32_t smem) { transpose_(tidx, smem, smem); } + + template + inline __device__ void transpose_(uint32_t smem_src, uint32_t smem_dst) { +#pragma unroll + for (int n_begin = 0; n_begin < N; n_begin += UNROLL_N) { + transpose_ldmatrix(n_begin, smem_src); + transpose_stmatrix(n_begin, smem_dst); + } + } + + inline __device__ void transpose_ldmatrix(int n_begin, uint32_t smem_src) { + static_assert(N % UNROLL_N == 0, ""); + + uint4 tmp[UNROLL_N][K]; + if (n_begin == 0) { + smem_read_loc_ = smem_src + read_offset_; + } + +#pragma unroll + for (int ni = n_begin; ni < n_begin + UNROLL_N; ni++) { + int const nii = ni - n_begin; +#pragma unroll + for (int ki = 0; ki < K; ki++) { // 2 + fmha::ldsmt(tmp[nii][ki], smem_read_loc_ + ki * ROWS_PER_LDSM_PER_CTA * BYTES_PER_ROW); + } + + if (WARPS_4x1x1 && N == 4) { // D=64, 1 warp in N + smem_read_loc_ ^= (ni % 2 == 0 ? 1 : 3) * 16; + } else if (WARPS_4x1x1 && N == 2) { // D=32, 1 warp in N + smem_read_loc_ ^= 16; + } else if (WARPS_4x1x2 && N == 2) { // D=64, 2 warps in N + smem_read_loc_ ^= 32; + } else if (WARPS_4x1x2 && N == 4) { // D=128, 2 warps in N + smem_read_loc_ ^= (ni % 2 == 0 ? 1 : 3) * 32; + } else if (WARPS_4x1x1 && N == 8) { // D=128, 1 warp in N + smem_read_loc_ ^= ((ni % 4 == 3) ? 7 : (ni % 2 == 1 ? 3 : 1)) * 16; + } else if (N != 1) { + assert(false); + } + } + +#pragma unroll + for (int ni = n_begin; ni < n_begin + UNROLL_N; ni++) { + int const nii = ni - n_begin; +#pragma unroll + for (int ki = 0; ki < K; ki++) { + fmha::swizzle_rows(regs_[nii][ki].x, regs_[nii][ki].z, tmp[nii][ki].x, + tmp[nii][ki].y); // PRMT 0+1 + fmha::swizzle_rows(regs_[nii][ki].y, regs_[nii][ki].w, tmp[nii][ki].z, + tmp[nii][ki].w); // PRMT 2+3 + } + } + } + + template + inline __device__ void transpose_stmatrix(int n_begin, uint32_t smem_dst) { + // After LDSM.Tx4 registers hold 2x2 elts: + // [00, 01] + // [10, 11] + // With row offsets + // x: + 0 + // y: + 8 + // z: +16 (g) + // w: +24 (o) + // + // After PRMT 0, the : + // [00, 01] [80, 81] => x: [00, 10, 80, 90], i.e. col 0 + // [10, 11] [90, 91] => z: [01, 11, 81, 91], i.e. col 1 + // + // [g0, g1] [o0, o1] => y: [g0, h0, o0, p0], i.e. col 0 + // [h0, h1] [p0, p1] => w: [g1, h1, o1, p1], i.e. 
col 1 + // + // Therefore, when looking at the transpose, quad q holds cols 2 * q + [0, 1], i.e. + // - quad 0 holds cols 0, 1 + // - quad 1 holds cols 2, 3 + // - etc. + // + // This fits with the accumulator layout, since N strides in steps of 8 per thread. + + if (SYNC) { // needed if src and dst are the same. + __syncthreads(); // LDSM.T done. We should now have a D x S tile in registers. SMEM can be + // written. + } + + if (n_begin == 0) { + smem_write_loc_ = smem_dst + write_offset_; + } + +#pragma unroll + for (int ni = n_begin; ni < n_begin + UNROLL_N; ni++) { + int const nii = ni - n_begin; +#pragma unroll + for (int ki = 0; ki < K; ki++) { + fmha::stsm(smem_write_loc_ + ki * BYTES_PER_ROW * D, regs_[nii][ki]); + } + if (WARPS_4x1x1) { // D=64, 1 warp in N. + smem_write_loc_ += 16 * BYTES_PER_ROW; + } else if (WARPS_4x1x2) { // D=64, 2 warps in N. + smem_write_loc_ += 32 * BYTES_PER_ROW; + } else { + assert(false); + } + } + } +}; + +template +struct Transposer { + static_assert(Cta_tile::K % 64 == 0); + + enum { + WARPS_M = Cta_tile::WARPS_M, + WARPS_N = Cta_tile::WARPS_N, + WARPS_K = Cta_tile::WARPS_K, + }; + + enum { + WARPS_4x1x1 = (WARPS_M == 4 && WARPS_N == 1 && WARPS_K == 1), + WARPS_4x1x2 = (WARPS_M == 4 && WARPS_N == 1 && WARPS_K == 2), + }; + + enum { BYTES_PER_LDS = 16 }; + + // D=64 and 4 warps. + // Per warp we load 32 rows x 16 columns with LDSM.Tx4, 128 rows per CTA. + enum { S = Cta_tile::K >= 128 ? 128 : Cta_tile::K }; // The sequence length. + + enum { D = Cta_tile::N >= 128 ? 128 : Cta_tile::N }; // The head dimension. + + static_assert(S % 64 == 0); + static_assert(WARPS_4x1x1); + static_assert(D % 32 == 0); + + static_assert(S == 64 && D == 128); + + // Two warps in S dim. + enum { ROWS_PER_LDSM_PER_CTA_WITHOUT_PACKING = 64 }; // LDSMx4 + + enum { BYTES_PER_ROW = 128 }; + + enum { ROW_PACKING = Div_up::VALUE }; + + enum { + ROWS_PER_LDSM_PER_CTA = ROWS_PER_LDSM_PER_CTA_WITHOUT_PACKING / ROW_PACKING + }; // due to row_packing + + // The number of loads in K dimension. + enum { K = S / ROWS_PER_LDSM_PER_CTA_WITHOUT_PACKING }; + + // The number of loads in the D dimension. Use two warps in D dim. 
+ enum { N = D / 32 }; + + uint4 regs_[UNROLL_N][K]; + + uint32_t read_offset_; + uint32_t write_offset_; + uint32_t smem_read_loc_; + uint32_t smem_write_loc_; + + inline __device__ Transposer(int tidx) { + int read_row, read_col; + + if (WARPS_4x1x1 && N == 1) { // D=32, 2 warps in N + read_row = (tidx & 0x20) / 4 + (tidx & 0x1c) / 4; + read_col = (tidx & 0x03) * 2; + read_col ^= (read_row & 0x01); + read_col ^= ((tidx & 0x40) / 64); + } else if (WARPS_4x1x1 && N == 2) { // D=64, 2 warps in N + read_row = (tidx & 0x20) / 2 + (tidx & 0x1e) / 2; + read_col = (tidx & 0x01) * 4 + (tidx & 0x06) / 2; + read_col ^= ((tidx & 0x40) / 64); + } else if (WARPS_4x1x1 && N == 4) { // D=128, 2 warps in N + read_row = (tidx & 0x3f); + read_col = (tidx & 0x07); + read_col ^= ((tidx & 0x40) / 64); + } else { + assert(false); + } + + read_offset_ = read_row * BYTES_PER_ROW + read_col * BYTES_PER_LDS; + + // static_assert(ROWS_PER_LDSM_PER_CTA == 32); + // constexpr int ROWS_PER_XOR_PATTERN = 4; + // constexpr int ROWS_PER_XOR_PATTERN = fmha::Rows_per_xor_pattern_ampere_b::VALUE; + + int row, col; + if (WARPS_4x1x1) { + row = (tidx & 0x40) / 4 + (tidx & 0x10) / 2 + (tidx & 0x07); + col = (tidx & 0x20) / 16 + (tidx & 0x08) / 8; + col = col + (row % 2) * 4; + row = row / 2; + col = col ^ (row % 4); + } else { + assert(false); + } + write_offset_ = row * BYTES_PER_ROW + col * BYTES_PER_LDS; + }; + + inline __device__ void transpose(int tidx, uint32_t smem) { transpose_(tidx, smem, smem); } + + template + inline __device__ void transpose_(uint32_t smem_src, uint32_t smem_dst) { +#pragma unroll + for (int n_begin = 0; n_begin < N; n_begin += UNROLL_N) { + transpose_ldmatrix(n_begin, smem_src); + transpose_stmatrix(n_begin, smem_dst); + } + } + + inline __device__ void transpose_ldmatrix(int n_begin, uint32_t smem_src) { + static_assert(N % UNROLL_N == 0, ""); + + uint4 tmp[UNROLL_N][K]; + if (n_begin == 0) { + smem_read_loc_ = smem_src + read_offset_; + } +#pragma unroll + for (int ni = n_begin; ni < n_begin + UNROLL_N; ni++) { +#pragma unroll + for (int ki = 0; ki < K; ki++) { + int const nii = ni - n_begin; + fmha::ldsmt(tmp[ni][ki], smem_read_loc_ + ki * ROWS_PER_LDSM_PER_CTA * BYTES_PER_ROW); + } + + if (WARPS_4x1x1 && N == 2) { // D=64, 2 warps in N + smem_read_loc_ ^= 32; + } else if (WARPS_4x1x1 && N == 4) { // D=128, 2 warps in N + smem_read_loc_ ^= (ni % 2 == 1 ? 3 * 32 : 32); + } else if (N != 1) { + assert(false); + } + } + +#pragma unroll + for (int ni = n_begin; ni < n_begin + UNROLL_N; ni++) { + int const nii = ni - n_begin; +#pragma unroll + for (int ki = 0; ki < K; ki++) { + fmha::swizzle_rows(regs_[nii][ki].x, regs_[nii][ki].z, tmp[nii][ki].x, + tmp[nii][ki].y); // PRMT 0+1 + fmha::swizzle_rows(regs_[nii][ki].y, regs_[nii][ki].w, tmp[nii][ki].z, + tmp[nii][ki].w); // PRMT 2+3 + } + } + } + + template + inline __device__ void transpose_stmatrix(int n_begin, uint32_t smem_dst) { + // After LDSM.Tx4 registers hold 2x2 elts: + // [00, 01] + // [10, 11] + // With row offsets + // x: + 0 + // y: + 8 + // z: +16 (g) + // w: +24 (o) + // + // After PRMT 0, the : + // [00, 01] [80, 81] => x: [00, 10, 80, 90], i.e. col 0 + // [10, 11] [90, 91] => z: [01, 11, 81, 91], i.e. col 1 + // + // [g0, g1] [o0, o1] => y: [g0, h0, o0, p0], i.e. col 0 + // [h0, h1] [p0, p1] => w: [g1, h1, o1, p1], i.e. col 1 + // + // Therefore, when looking at the transpose, quad q holds cols 2 * q + [0, 1], i.e. + // - quad 0 holds cols 0, 1 + // - quad 1 holds cols 2, 3 + // - etc. 
+ // + // This fits with the accumulator layout, since N strides in steps of 8 per thread. + + if (SYNC) { + __syncthreads(); // LDSM.T done. We should now have a D x S tile in registers. SMEM can be + // written. + } + + if (n_begin == 0) { + smem_write_loc_ = smem_dst + write_offset_; + } + +#pragma unroll + for (int ni = n_begin; ni < n_begin + UNROLL_N; ni++) { + int const nii = ni - n_begin; +#pragma unroll + for (int ki = 0; ki < K; ki++) { + fmha::stsm(smem_write_loc_ + ki * BYTES_PER_ROW * D / 2, regs_[nii][ki]); + } + if (WARPS_4x1x1) { // D=64, 1 warp in N. + smem_write_loc_ += 16 * BYTES_PER_ROW; + } else { + assert(false); + } + } + } +}; + +template < + // The instruction traits. + typename Traits, + // The Cta_tile. + typename Cta_tile, + // The number of buffers. + int BUFFERS_PER_TILE, + // GMMA descriptor mode + fmha::Gmma_descriptor_mode desc_mode, + // USe TMA or not, + bool USE_TMA> +struct Smem_tile_v_gmma { + static_assert(sizeof(typename Traits::B_type) == 1); + + // K is the sequence length dimension (128 for GMMA) + enum { K_ = Cta_tile::K % 128 == 0 ? 128 : 64 }; + + static_assert(Cta_tile::K % K_ == 0); + + // static_assert(Cta_tile::N == 128); + // static_assert(K_ == 128); + // static_assert(BUFFERS_PER_TILE == 2); + + using Cta_tile_gmma_ = + typename Traits::template Cta_tile; + + // TODO Swizzle_32B? + static constexpr fmha::Gmma_descriptor_mode GMMA_DESC_MODE_V = + Cta_tile_gmma_::K * sizeof(typename Traits::B_type) >= 128 + ? fmha::Gmma_descriptor_mode::SWIZZLE_128B + : fmha::Gmma_descriptor_mode::SWIZZLE_64B; + + static_assert( + (Cta_tile::K % 128 == 0 && GMMA_DESC_MODE_V == fmha::Gmma_descriptor_mode::SWIZZLE_128B) || + (Cta_tile::K % 64 == 0 && GMMA_DESC_MODE_V == fmha::Gmma_descriptor_mode::SWIZZLE_64B)); + + enum { NUM_KGROUPS = Cta_tile::K / Cta_tile_gmma_::K }; + + static_assert(NUM_KGROUPS * Cta_tile_gmma_::K == Cta_tile::K); + + enum { BYTES_PER_STS = 16 }; + + // The compute tile only requires static information from Smem_tile_v and accesses SMEM directly + // through GMMA. Hence, we declare a SxD column major matrix in SMEM and have to make sure at + // runtime that the data is transposed. Note that for K > 128, we are using two buffers per tile, + // which we have to fill accordingly. + using Base_ = fmha::Smem_tile_hopper_b; + + // Split D or not, which influences the GMMA_GROUP_SMEM_DISTANCE, and BYTES_PER_BUFFER_NO_4LSB. + // Split-d smem view (2 split D, and 3 buffers): d0, d0, d0, d1, d1, d1. + // The group distance would be number_of_buffers * buffer_size. + // The buffer size is the size for split-d. + static constexpr size_t GMMA_GROUP_SMEM_DISTANCE = + Base_::GMMA_GROUP_SMEM_DISTANCE * BUFFERS_PER_TILE; + static constexpr size_t BYTES_PER_BUFFER_NO_4LSB = Base_::BYTES_PER_BUFFER_NO_4LSB; + + using Gmma_descriptor = typename Base_::Gmma_descriptor; + + struct Base : public Base_ { + using Transposer = Transposer; + static_assert(USE_TMA == false); + static constexpr bool TRANSPOSE = true; + + enum { NUM_KGROUPS = Cta_tile::K / Cta_tile_gmma_::K }; + + enum { ROWS_PER_XOR_PATTERN = fmha::Rows_per_xor_pattern_ampere_b::VALUE }; + + using Descriptor = typename Base_::Gmma_descriptor; + + // Delegate all the stores to the Row-Major Smem_tile. 
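+    // Stores land in the natural row-major S x D layout via the delegate; transpose_tile() below
+    // then uses the Transposer (LDSM.T + PRMT + STSM) to rewrite the same SMEM region into the
+    // transposed layout that the GMMA descriptor expects, so the store path itself stays
+    // oblivious to the transpose.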
+ using Store_delegate = Smem_tile_without_skews; + + using Store_type = typename Store_delegate::Store_type; + + enum { S = Cta_tile::K }; + + // static_assert(Descriptor::BYTES_PER_LEADING_DIM == 128); + // static_assert(Descriptor::STRIDE_BYTE_OFFSET == K_ * 8 / 16); // 128 * 8 / 16 + // static_assert(Descriptor::TRANS_MODE == fmha::Gmma_descriptor_transpose::NOTRANS); + // static_assert(Base::BYTES_PER_TILE == S * 64); + // static_assert(!Descriptor::LEADING_BYTE_OFFSET_NEEDED); + // static_assert(Descriptor::LEADING_BYTE_OFFSET == 128 * 64 / 16); + // static_assert(Descriptor::BYTES_PER_DESC_NO_4LSB == 32 * 1 / 16); + // static_assert(Descriptor::BYTES_DESC_INC_BOUNDARY_NO_4LSB == (K_ / 32 - 1) * 2); + // static_assert(Base::BYTES_PER_BUFFER_NO_4LSB == K_ * 64 / 16); + // static_assert(Base::GMMA_GROUP_SMEM_DISTANCE == 128 * 128 * 2); + // static_assert(Base::BYTES_PER_BUFFER_NO_4LSB == 128 * 128); + + // static_assert(Store_delegate::N_WITH_PADDING == 64); + // static_assert(Store_delegate::ROWS_PER_XOR_PATTERN == 4); + // static_assert(Store_delegate::BYTES_PER_ROW_BEFORE_PACKING == 64); + // static_assert(Store_delegate::ROWS == S / 2); + // static_assert(Store_delegate::BYTES_PER_ROW == 128); + + // Number of rows a warp loads per LDSMx4 + enum { ROWS_PER_LDSM = 4 * 8 }; + + enum { ROWS_PER_LDSM_PER_CTA = ROWS_PER_LDSM * Cta_tile::WARPS_M }; + + static_assert(Cta_tile::WARPS_M == 4); + + enum { LDSMS = Cta_tile::K / ROWS_PER_LDSM_PER_CTA }; + + // TODO we're assigning all rows loaded by a warp group (128 per CTA) to the K dimension. + // This only works for K a multiple of 128. + // For S=192, we want 3 blocks of 64xD. + // static_assert(LDSMS * ROWS_PER_LDSM_PER_CTA == Cta_tile::K); + + static_assert(LDSMS == S / 128); + + enum { BYTES_PER_LDS = 16 }; + + enum { BYTES_PER_ROW = Store_delegate::BYTES_PER_ROW }; + + enum { + WARPS_M = Cta_tile::WARPS_M, + WARPS_N = Cta_tile::WARPS_N, + WARPS_K = Cta_tile::WARPS_K, + }; + + enum { + WARPS_4x1x1 = (WARPS_M == 4 && WARPS_N == 1 && WARPS_K == 1), + WARPS_4x1x2 = (WARPS_M == 4 && WARPS_N == 1 && WARPS_K == 2), + }; + + inline __device__ Base(void* smem, int tidx) + : Base_(smem, tidx), delegate(smem, tidx), transposer(tidx) {} + + // Store to the tile in shared memory. + template + inline __device__ void store(Store_type const (&data)[N]) { + uint32_t smem_ptrs[N]; + delegate.compute_store_pointers(smem_ptrs); + sts(smem_ptrs, data); + } + + // Store to the tile in shared memory. + template + inline __device__ void store(Store_type const (&data)[N], uint32_t (&preds)[M]) { + uint32_t smem_ptrs[N]; + delegate.compute_store_pointers(smem_ptrs); + sts(smem_ptrs, data, preds); + } + + // Store to the tile in shared memory. + template + inline __device__ void store(Store_type const (&data)[N], uint32_t preds) { + delegate.store(data, preds); + } + + // Store to the tile in shared memory. + template + inline __device__ void store(void const* (&gmem_ptrs)[N], uint32_t (&preds)[M]) { + uint32_t smem_ptrs[N]; + delegate.compute_store_pointers(smem_ptrs); + ldgsts(smem_ptrs, gmem_ptrs, preds); + } + + // Store to the tile in shared memory. + template + inline __device__ void store(void const* (&gmem_ptrs)[N], uint32_t preds, uint64_t = 0) { + uint32_t tmp[1] = {preds}; + delegate.store(gmem_ptrs, tmp); + } + + // Store to the tile in shared memory. 
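+    // (Like the overloads above, this variant simply forwards to the row-major store delegate.)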
+ template + inline __device__ void store(void const* (&gmem_ptrs)[N], uint32_t preds) { + uint32_t tmp[1] = {preds}; + delegate.store(gmem_ptrs, tmp); + } + + // Initial offset (via tidx) has been moved to ctor + inline __device__ void transpose_tile(int /* tidx */) { transposer.transpose(0, this->smem_); } + + template + inline __device__ void transpose_tile(uint32_t smem_src, uint32_t smem_dst) { + transposer.template transpose_(smem_src, smem_dst); + } + + inline __device__ void transpose_tile_ldmatrix(int, uint32_t smem) { + transposer.transpose_ldmatrix(0, smem); + } + + inline __device__ void transpose_tile_stmatrix(int, uint32_t smem) { + transposer.template transpose_stmatrix(0, smem); + } + + inline __device__ void transpose_tile_128(int tidx) { + // D=64 and 4 warps. + // Per warp we load 32 rows x 16 columns with LDSM.Tx4, 128 rows per CTA. + constexpr int S = Cta_tile::K; // The sequence length. + constexpr int D = Cta_tile::N; // The head dimension. + // static_assert(S == 256); + static_assert(D == 64); + // static_assert(S % 128 == 0); + static_assert(WARPS_4x1x1 || WARPS_4x1x2); + static_assert(D % (16 * WARPS_K) == 0); + + constexpr int ROWS_PER_LDSM_PER_CTA_WITHOUT_PACKING = 128; // LDSMx4 + constexpr int BYTES_PER_ROW = 128; + constexpr int ROW_PACKING = BYTES_PER_ROW / (D * sizeof(Traits::B_type)); + + // The number of loads in K dimension. + constexpr int K = S / ROWS_PER_LDSM_PER_CTA_WITHOUT_PACKING; + // static_assert(K * ROWS_PER_LDSM_PER_CTA_WITHOUT_PACKING == S); + // static_assert(K == 3); + // The number of loads in the D dimension. + constexpr int N = D / (16 * WARPS_K); + static_assert(N * 16 * WARPS_K == D); + + int read_row, read_col; + + if (WARPS_4x1x1 && N == 4) { // D=64, 1 warp in N + read_row = (tidx & 0xe0) / 2 + (tidx & 0x1e) / 2; + read_col = (tidx & 0x01) * 4 + (tidx & 0x06) / 2; + } else if (WARPS_4x1x2 && N == 2) { // D=64, 2 warps in N + read_row = (tidx & 0x60) / 2 + (tidx & 0x1e) / 2; + read_col = (tidx & 0x01) * 4 + (tidx & 0x06) / 2; + // For two warpgroups we do two steps in N at once. + read_col ^= (tidx & 0x80) / 128; + } else { + assert(false); + } + + uint32_t offset = read_row * BYTES_PER_ROW + read_col * 16; + + constexpr int ROWS_PER_LDSM_PER_CTA = + ROWS_PER_LDSM_PER_CTA_WITHOUT_PACKING / ROW_PACKING; // due to row_packing + + uint4 tmp[N][K]; + uint32_t smem_tmp = this->smem_; //__nvvm_get_smem_pointer(v_smem_) ; + uint32_t smem_loc = smem_tmp + offset; + +#pragma unroll + for (int ni = 0; ni < N; ni++) { +#pragma unroll + for (int ki = 0; ki < K; ki++) { + fmha::ldsmt(tmp[ni][ki], smem_loc + ki * ROWS_PER_LDSM_PER_CTA * BYTES_PER_ROW); + } + + if (WARPS_4x1x1 && N == 4) { // D=64, 1 warp in N + smem_loc ^= (ni % 2 == 0 ? 1 : 3) * 16; + } else if (WARPS_4x1x2 && N == 2) { // D=64, 2 warps in N + smem_loc ^= 32; + } else { + assert(false); + } + } + + uint4 regs[N][K]; + +#pragma unroll + for (int ni = 0; ni < N; ni++) { +#pragma unroll + for (int ki = 0; ki < K; ki++) { + fmha::swizzle_rows(regs[ni][ki].x, regs[ni][ki].z, tmp[ni][ki].x, + tmp[ni][ki].y); // PRMT 0+1 + fmha::swizzle_rows(regs[ni][ki].y, regs[ni][ki].w, tmp[ni][ki].z, + tmp[ni][ki].w); // PRMT 2+3 + } + } + + // After LDSM.Tx4 registers hold 2x2 elts: + // [00, 01] + // [10, 11] + // With row offsets + // x: + 0 + // y: + 8 + // z: +16 (g) + // w: +24 (o) + // + // After PRMT 0, the : + // [00, 01] [80, 81] => x: [00, 10, 80, 90], i.e. col 0 + // [10, 11] [90, 91] => z: [01, 11, 81, 91], i.e. col 1 + // + // [g0, g1] [o0, o1] => y: [g0, h0, o0, p0], i.e. 
col 0 + // [h0, h1] [p0, p1] => w: [g1, h1, o1, p1], i.e. col 1 + // + // Therefore, when looking at the transpose, quad q holds cols 2 * q + [0, 1], i.e. + // - quad 0 holds cols 0, 1 + // - quad 1 holds cols 2, 3 + // - etc. + // + // This fits with the accumulator layout, since N strides in steps of 8 per thread. + + __syncthreads(); // LDSM.T done. We should now have a D x S tile in registers. SMEM can be + // written. + constexpr int ROWS_PER_XOR_PATTERN = fmha::Rows_per_xor_pattern_ampere_b::VALUE; + static_assert(ROWS_PER_XOR_PATTERN == 8); + + int row, col; + if (WARPS_4x1x1) { + row = (tidx & 0x10) / 2 + (tidx & 0x07); + col = (tidx & 0x60) / 16 + (tidx & 0x08) / 8; + } else if (WARPS_4x1x2) { + // Same as above, with second warp group writing next 16 rows. + row = (tidx & 0x80) / 8 + (tidx & 0x10) / 2 + (tidx & 0x07); + col = (tidx & 0x60) / 16 + (tidx & 0x08) / 8; + } else { + assert(false); + } + col ^= (row & 0x07); + int dst = smem_tmp + row * BYTES_PER_ROW + col * BYTES_PER_LDS; + +#pragma unroll + for (int ni = 0; ni < N; ni++) { +#pragma unroll + for (int ki = 0; ki < K; ki++) { + fmha::stsm(dst + ki * BYTES_PER_ROW * D, regs[ni][ki]); + } + if (WARPS_4x1x1 && N == 4) { // D=64, 1 warp in N. + dst += 16 * BYTES_PER_ROW; + } else if (WARPS_4x1x2 && N == 2) { // D=64, 2 warps in N. + dst += 32 * BYTES_PER_ROW; + } else { + assert(false); + } + } + } + + Store_delegate delegate; + Transposer transposer; + }; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Smem_tile_v< + fmha::Hopper_qgmma_e4m3_fp32_traits, Cta_tile, + BUFFERS_PER_TILE, desc_mode, USE_TMA> + : public Smem_tile_v_gmma< + fmha::Hopper_qgmma_e4m3_fp32_traits, + Cta_tile, BUFFERS_PER_TILE, desc_mode, USE_TMA>::Base { + using Traits = fmha::Hopper_qgmma_e4m3_fp32_traits; + + using Base = + typename fmha::Smem_tile_v_gmma::Base; + + inline __device__ Smem_tile_v(void* smem, int tidx) : Base(smem, tidx) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Smem_tile_v< + fmha::Hopper_igmma_int8_int32_traits, Cta_tile, + BUFFERS_PER_TILE, desc_mode, USE_TMA> + : public Smem_tile_v_gmma< + fmha::Hopper_igmma_int8_int32_traits, + Cta_tile, BUFFERS_PER_TILE, desc_mode, USE_TMA>::Base { + using Traits = fmha::Hopper_igmma_int8_int32_traits; + + using Base = + typename fmha::Smem_tile_v_gmma::Base; + + inline __device__ Smem_tile_v(void* smem, int tidx) : Base(smem, tidx) {} +}; + +} // namespace fmha diff --git a/csrc/fmha_v2/fmha/hopper/smem_tile_o.h b/csrc/fmha_v2/fmha/hopper/smem_tile_o.h new file mode 100644 index 0000000000..cd499a5f39 --- /dev/null +++ b/csrc/fmha_v2/fmha/hopper/smem_tile_o.h @@ -0,0 +1,325 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2011-2024 NVIDIA CORPORATION & AFFILIATES. All rights + * reserved. SPDX-License-Identifier: NVIDIA TensorRT Source Code License Agreement + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. 
+ */ + +#pragma once + +#include + +namespace fmha { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct Smem_tile_o_dummy { + enum { BYTES_PER_TILE = 0 }; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Smem_tile_o_gmma_32bit_8bit : public Smem_tile_o_base_8bit_mma { + // The base class. + using Base = Smem_tile_o_base_8bit_mma; + + using Mma_tile = typename Base::Mma_tile; + using Accumulator = typename Base::Accumulator; + + enum { + BYTES_PER_ROW = Base::BYTES_PER_ROW, + BYTES_PER_ROW_WITH_PACKING = Base::BYTES_PER_ROW_WITH_PACKING, + LOOPS = Base::LOOPS, + LDS_PER_LOOP = Base::LDS_PER_LOOP, + ROWS_PER_LDS = Base::ROWS_PER_LDS, + HAS_INCOMPLETE_LDS = Base::HAS_INCOMPLETE_LDS, + }; + + // Ctor. + inline __device__ Smem_tile_o_gmma_32bit_8bit(void* smem, int tidx) : Base(smem, tidx) {} + + // Store the accumulators. + inline __device__ void store(Accumulator const (&acc)[1][1], int mi) { + enum { M_PER_MMA = Mma_tile::M_PER_MMA_PER_CTA }; + + static_assert(M_PER_MMA == 64); + static_assert(Base::WARPS_4x1x2); + + // The number of MMAs that are stored per loop iteration. + enum { MMAS_M_PER_LOOP = Mma_tile::MMAS_M / LOOPS }; + + static_assert(MMAS_M_PER_LOOP == 1); + static_assert(Mma_tile::MMAS_N == 1); + static_assert(Mma_tile::CORES_N == 8); + static_assert(Accumulator::NUM_REGS == Mma_tile::CORES_N / 2 * 8); + static_assert(BYTES_PER_ROW == 64 * 4); + static_assert(Cta_tile::WARPS_K == 2); + + static_assert(Mma_tile::CORES_N / 2 == 4); + +#pragma unroll + for (int ni = 0; ni < Mma_tile::CORES_N / 2; ++ni) { +#pragma unroll + for (int mj = 0; mj < MMAS_M_PER_LOOP; ++mj) { + uint4 row_0; + row_0.x = acc[0][0].reg(ni * 8 + 0); // Even + row_0.y = acc[0][0].reg(ni * 8 + 4); // Odd + row_0.z = acc[0][0].reg(ni * 8 + 1); // Even + row_0.w = acc[0][0].reg(ni * 8 + 5); // Odd + uint4 row_1; + row_1.x = acc[0][0].reg(ni * 8 + 2); // Even + row_1.y = acc[0][0].reg(ni * 8 + 6); // Odd + row_1.z = acc[0][0].reg(ni * 8 + 3); // Even + row_1.w = acc[0][0].reg(ni * 8 + 7); // Odd + + // Regs_to_rows::extract(acc[mi * MMAS_M_PER_LOOP + mj][ni], row_0, row_1); + + // Each thread of a quad writes 16B per STS -> 64B per store. Account for 2 -> 128B. + int imm_0 = (mj * M_PER_MMA + 0) * BYTES_PER_ROW * Cta_tile::WARPS_K + (ni / 2) * 128; + int imm_1 = (mj * M_PER_MMA + 8) * BYTES_PER_ROW * Cta_tile::WARPS_K + (ni / 2) * 128; + + // Store the elements. + fmha::sts(this->smem_write_ + imm_0, row_0); + fmha::sts(this->smem_write_ + imm_1, row_1); + } + // Each thread of a quad writes 16B per STS -> 64B per store. + if (Mma_tile::MMAS_N == 1) { + this->smem_write_ ^= 64; + } else { + assert(false && "Unsupported"); + } + } + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Smem_tile_o, Cta_tile> + : public Hmma_smem_tile_o< + Hopper_hgmma_fp16_traits, Cta_tile> { + // The traits class. + using Traits = Hopper_hgmma_fp16_traits; + // The base class. + using Base = Hmma_smem_tile_o; + + using Mma_tile = typename Base::Mma_tile; + + using Accumulator = typename Base::Accumulator; + + enum { + LOOPS = Base::LOOPS, + ROW_PACKING = Base::ROW_PACKING, + BYTES_PER_ROW = Base::BYTES_PER_ROW, + }; + + // Ctor. + inline __device__ Smem_tile_o(void* smem, int tidx) : Base(smem, tidx) {} + + // Store the accumulators. 
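+  // Note on the XOR pattern below: recomputing (smem_write_ ^ (ni * 16)) each iteration is
+  // equivalent to in-place XOR steps of 16 * {1, 3, 1, 7, 1, 3, 1, ...}, which is what the
+  // "inplace multiples" remark inside the loop refers to.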
+ inline __device__ void store(Accumulator const (&acc)[1][1], int mi) { + enum { M_PER_MMA = Mma_tile::M_PER_MMA_PER_CTA }; + +#pragma unroll + for (int ni = 0; ni < Mma_tile::CORES_N; ++ni) { + // The number of MMAs that are stored per loop iteration. + enum { MMAS_M_PER_LOOP = Mma_tile::MMAS_M / LOOPS }; + + static_assert(MMAS_M_PER_LOOP == 1); + // inplace multiples seem to be 1, 3, 1, 7, 1, 3, 1, + auto smem_write = this->smem_write_ ^ (ni * 16); +// Store 1st column of the different MMAs. +#pragma unroll + for (int mj = 0; mj < MMAS_M_PER_LOOP; ++mj) { + // Precompute the immediates to jump between rows. + int row_0 = (mj * M_PER_MMA + 0) * BYTES_PER_ROW * Cta_tile::WARPS_K; + int row_1 = (mj * M_PER_MMA + 8) * BYTES_PER_ROW * Cta_tile::WARPS_K; + + // Store. + fmha::sts(smem_write + row_0, acc[0][0].reg(ni * 2 + 0)); + fmha::sts(smem_write + row_1, acc[0][0].reg(ni * 2 + 1)); + } + } + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Smem_tile_o, Cta_tile> + : public Hmma_smem_tile_o< + Hopper_hgmma_fp32_traits, Cta_tile> { + // The traits class. + using Traits = Hopper_hgmma_fp32_traits; + // The base class. + using Base = Hmma_smem_tile_o; + + using Mma_tile = typename Base::Mma_tile; + + using Accumulator = typename Base::Accumulator; + + enum { + LOOPS = Base::LOOPS, + ROW_PACKING = Base::ROW_PACKING, + BYTES_PER_ROW = Base::BYTES_PER_ROW, + }; + + // Ctor. + inline __device__ Smem_tile_o(void* smem, int tidx) : Base(smem, tidx) {} + + // Store the accumulators. + inline __device__ void store(Accumulator const (&acc)[1][1], int mi) { + enum { M_PER_MMA = Mma_tile::M_PER_MMA_PER_CTA }; + +#pragma unroll + for (int ni = 0; ni < Mma_tile::CORES_N; ++ni) { + // The number of MMAs that are stored per loop iteration. + enum { MMAS_M_PER_LOOP = Mma_tile::MMAS_M / LOOPS }; + + static_assert(MMAS_M_PER_LOOP == 1); + // inplace multiples seem to be 1, 3, 1, 7, 1, 3, 1, + auto smem_write = this->smem_write_ ^ (ni * 16); +// Store 1st column of the different MMAs. +#pragma unroll + for (int mj = 0; mj < MMAS_M_PER_LOOP; ++mj) { + // Precompute the immediates to jump between rows. + int row_0 = (mj * M_PER_MMA + 0) * BYTES_PER_ROW * Cta_tile::WARPS_K; + int row_1 = (mj * M_PER_MMA + 8) * BYTES_PER_ROW * Cta_tile::WARPS_K; + + uint32_t val_0 = float2_to_half2(acc[0][0].elt(2 * ni * Mma_tile::CORES_M + 0), + acc[0][0].elt(2 * ni * Mma_tile::CORES_M + 1)); + + uint32_t val_1 = float2_to_half2(acc[0][0].elt(2 * ni * Mma_tile::CORES_M + 2), + acc[0][0].elt(2 * ni * Mma_tile::CORES_M + 3)); + + // Store. + fmha::sts(smem_write + row_0, val_0); + fmha::sts(smem_write + row_1, val_1); + } + } + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Smem_tile_o, Cta_tile> + : public Hmma_smem_tile_o< + Hopper_hgmma_bf16_traits, Cta_tile> { + // The traits class. + using Traits = Hopper_hgmma_bf16_traits; + // The base class. + using Base = Hmma_smem_tile_o; + + using Mma_tile = typename Base::Mma_tile; + + using Accumulator = typename Base::Accumulator; + + enum { + LOOPS = Base::LOOPS, + ROW_PACKING = Base::ROW_PACKING, + BYTES_PER_ROW = Base::BYTES_PER_ROW, + }; + + // Ctor. + inline __device__ Smem_tile_o(void* smem, int tidx) : Base(smem, tidx) {} + + // Convert fp32 to bf16, and store the accumulators. 
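+  // Mirrors the fp32 -> fp16 path above: each pair of consecutive fp32 accumulator elements is
+  // packed into one 32-bit register holding two bf16 values before the 4B STS.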
+ inline __device__ void store(Accumulator const (&acc)[1][1], int mi) { + enum { M_PER_MMA = Mma_tile::M_PER_MMA_PER_CTA }; + + static_assert(Mma_tile::CORES_M == 2); + +#pragma unroll + for (int ni = 0; ni < Mma_tile::CORES_N; ++ni) { + // The number of MMAs that are stored per loop iteration. + enum { MMAS_M_PER_LOOP = Mma_tile::MMAS_M / LOOPS }; + + static_assert(MMAS_M_PER_LOOP == 1); + // inplace multiples seem to be 1, 3, 1, 7, 1, 3, 1, + auto smem_write = this->smem_write_ ^ (ni * 16); +// Store 1st column of the different MMAs. +#pragma unroll + for (int mj = 0; mj < MMAS_M_PER_LOOP; ++mj) { + // Precompute the immediates to jump between rows. + int row_0 = (mj * M_PER_MMA + 0) * BYTES_PER_ROW * Cta_tile::WARPS_K; + int row_1 = (mj * M_PER_MMA + 8) * BYTES_PER_ROW * Cta_tile::WARPS_K; + + uint32_t val_0 = float2_to_bf16_x2(acc[0][0].elt(2 * ni * Mma_tile::CORES_M + 0), + acc[0][0].elt(2 * ni * Mma_tile::CORES_M + 1)); + + uint32_t val_1 = float2_to_bf16_x2(acc[0][0].elt(2 * ni * Mma_tile::CORES_M + 2), + acc[0][0].elt(2 * ni * Mma_tile::CORES_M + 3)); + + // Store. + fmha::sts(smem_write + row_0, val_0); + fmha::sts(smem_write + row_1, val_1); + } + } + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Smem_tile_o, + Cta_tile> + : public Smem_tile_o_gmma_32bit_8bit< + Hopper_qgmma_e4m3_fp32_traits, Cta_tile> { + // The traits class. + using Traits = Hopper_qgmma_e4m3_fp32_traits; + // The base class. + using Base = Smem_tile_o_gmma_32bit_8bit; + + // Ctor. + inline __device__ Smem_tile_o(void* smem, int tidx) : Base(smem, tidx) {} +}; + +template +struct Smem_tile_o, + Cta_tile> + : public Smem_tile_o_gmma_32bit_8bit< + Hopper_igmma_int8_int32_traits, Cta_tile> { + // The traits class. + using Traits = Hopper_igmma_int8_int32_traits; + // The base class. + using Base = Smem_tile_o_gmma_32bit_8bit; + + // Ctor. + inline __device__ Smem_tile_o(void* smem, int tidx) : Base(smem, tidx) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace fmha diff --git a/csrc/fmha_v2/fmha/hopper/tma_descriptor.h b/csrc/fmha_v2/fmha/hopper/tma_descriptor.h new file mode 100644 index 0000000000..22071f3585 --- /dev/null +++ b/csrc/fmha_v2/fmha/hopper/tma_descriptor.h @@ -0,0 +1,348 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2011-2024 NVIDIA CORPORATION & AFFILIATES. All rights + * reserved. SPDX-License-Identifier: NVIDIA TensorRT Source Code License Agreement + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. + */ + +#pragma once +#include + +namespace fmha { + +// manage TMA descriptor host code. +// allocate, deallocate and manipulate tma desc in the host +// copy the tma descriptor from host code to device code +// Multiple TMA desc, one desc per batch. +// Device desc ptr should be allocated outside the class and reused +template < + // number of dimensions. 
+ int NUM_DIMS> +class Multiple_tma_descriptor { + public: + // ctor + Multiple_tma_descriptor(int batch_size_) : batch_size(batch_size_) { + if (batch_size > 0) { + // allocate host memory + desc_ptr_h = new cudaTmaDesc[batch_size]; + // make sure all bit fields are zeros. + memset(desc_ptr_h, 0, sizeof(cudaTmaDesc) * batch_size); + } + } + + // ctor + Multiple_tma_descriptor() = default; + + // destructor. + ~Multiple_tma_descriptor() { + if (batch_size > 0) { + // deallocate host memory + delete[] desc_ptr_h; + } + } + + // set the desctriptor. + int set_tma_desctriptor( + // ptr to gmem + void const* gmem_ptr, + // format is really data_type in TMA terminology. + cudaTmaDescFormat format, + // interleave mode. + cudaTmaDescInterleave interleave, + // swizzle mode. + cudaTmaDescSwizzle swizzle, + // L2 sector promotion. + cudaTmaDescPromotion promotion, uint32_t const (&tensor_size_array)[NUM_DIMS], + uint64_t const (&tensor_stride_array)[NUM_DIMS - 1], + uint32_t const (&traversal_stride_array)[NUM_DIMS], + uint32_t const (&box_size_array)[NUM_DIMS], + // OOB fill mode. + uint32_t fill_oob, + // FP32 to TF32 conversion. + uint32_t round_to_tf32, + // index to desc. + int batch_idx) { + set_tensor_common_0(&desc_ptr_h[batch_idx], reinterpret_cast(gmem_ptr)); + set_tensor_common_1(&desc_ptr_h[batch_idx], TILED, NUM_DIMS, format, interleave, swizzle, + fill_oob, round_to_tf32, promotion); + + set_tensor_stride(&desc_ptr_h[batch_idx], tensor_stride_array); + set_tensor_size(&desc_ptr_h[batch_idx], tensor_size_array); + + set_traversal_stride_tiled(&desc_ptr_h[batch_idx], traversal_stride_array); + + set_box_size(&desc_ptr_h[batch_idx], box_size_array); + return 0; + } + + // set the desctriptor. + int set_tma_desctriptor( + // ptr to gmem + void const* gmem_ptr, + // format is really data_type in TMA terminology. + cudaTmaDescFormat format, + // interleave mode. + cudaTmaDescInterleave interleave, + // swizzle mode. + cudaTmaDescSwizzle swizzle, + // L2 sector promotion. + cudaTmaDescPromotion promotion, uint32_t const (&tensor_size_array)[NUM_DIMS], + uint64_t const (&tensor_stride_array)[NUM_DIMS - 1], + uint32_t const (&traversal_stride_array)[NUM_DIMS], + uint32_t const (&box_size_array)[NUM_DIMS], + // OOB fill mode. + uint32_t fill_oob, + // FP32 to TF32 conversion. + uint32_t round_to_tf32, + // index to desc. 
+ cudaTmaDesc* desc_ptr = nullptr) { + set_tensor_common_0(desc_ptr, reinterpret_cast(gmem_ptr)); + set_tensor_common_1(desc_ptr, TILED, NUM_DIMS, format, interleave, swizzle, fill_oob, + round_to_tf32, promotion); + + set_tensor_stride(desc_ptr, tensor_stride_array); + set_tensor_size(desc_ptr, tensor_size_array); + + set_traversal_stride_tiled(desc_ptr, traversal_stride_array); + + set_box_size(desc_ptr, box_size_array); + return 0; + } + + // copy the desc to device memory + void copy_to_device(void* desc_ptr_d_, cudaStream_t stream = 0) { + FMHA_CHECK_CUDA(cudaMemcpy(desc_ptr_d_, desc_ptr_h, TMA_DESC_SIZE_IN_BYTE * batch_size, + cudaMemcpyHostToDevice)); + } + + // get desc in host + cudaTmaDesc get_desc_in_host(int batch_idx) const { return desc_ptr_h[batch_idx]; } + + private: + void set_tensor_common_0(cudaTmaDesc* p_desc, uint64_t addr) { + cudaTmaDescTiled* desc = reinterpret_cast(p_desc); + desc->tensor_common0 = 0; + desc->tensor_common0 |= (addr); + } + + void set_tensor_common_1(cudaTmaDesc* p_desc, cudaTmaDescType desc_type, uint32_t dims, + cudaTmaDescFormat format, cudaTmaDescInterleave interleave, + cudaTmaDescSwizzle swizzle, uint32_t fill, uint32_t f32_to_tf32, + cudaTmaDescPromotion promotion) { + cudaTmaDescTiled* desc = reinterpret_cast(p_desc); + + desc->tensor_common1 = 0; + desc->tensor_common1 |= desc_type == TILED ? 0x0 : 0x1; + + constexpr uint32_t VERSION_SHIFT = 1; + constexpr uint32_t VERSION_BITS = 3; + desc->tensor_common1 |= (1u << VERSION_SHIFT); + + constexpr uint32_t DIM_BITS = 3; + constexpr uint32_t DIM_SHIFT = VERSION_SHIFT + VERSION_BITS; + constexpr uint32_t DIM_MASK = (1u << DIM_BITS) - 1; + desc->tensor_common1 |= ((dims - 1) & DIM_MASK) << DIM_SHIFT; + + constexpr uint32_t FORMAT_BITS = 4; + constexpr uint32_t FORMAT_SHIFT = DIM_SHIFT + DIM_BITS; + constexpr uint32_t FORMAT_MASK = (1u << FORMAT_BITS) - 1; + desc->tensor_common1 |= (static_cast(format) & FORMAT_MASK) << FORMAT_SHIFT; + + constexpr uint32_t INTERLEAVE_BITS = 2; + constexpr uint32_t INTERLEAVE_SHIFT = FORMAT_SHIFT + FORMAT_BITS; + constexpr uint32_t INTERLEAVE_MASK = (1u << INTERLEAVE_BITS) - 1; + desc->tensor_common1 |= (static_cast(interleave) & INTERLEAVE_MASK) + << INTERLEAVE_SHIFT; + + constexpr uint32_t SWIZZLE_BITS = 2; + constexpr uint32_t SWIZZLE_SHIFT = INTERLEAVE_SHIFT + INTERLEAVE_BITS; + constexpr uint32_t SWIZZLE_MASK = (1u << SWIZZLE_BITS) - 1; + desc->tensor_common1 |= (static_cast(swizzle) & SWIZZLE_MASK) << SWIZZLE_SHIFT; + + constexpr uint32_t FILL_BITS = 1; + constexpr uint32_t FILL_SHIFT = SWIZZLE_SHIFT + SWIZZLE_BITS; + constexpr uint32_t FILL_MASK = (1u << FILL_BITS) - 1; + desc->tensor_common1 |= (static_cast(fill) & FILL_MASK) << FILL_SHIFT; + + constexpr uint32_t F32_TO_TF32_BITS = 1; + constexpr uint32_t F32_TO_TF32_SHIFT = FILL_SHIFT + FILL_BITS; + constexpr uint32_t F32_TO_TF32_MASK = (1u << F32_TO_TF32_BITS) - 1; + desc->tensor_common1 |= (static_cast(f32_to_tf32) & F32_TO_TF32_MASK) + << F32_TO_TF32_SHIFT; + + constexpr uint32_t PROMOTION_BITS = 2; + constexpr uint32_t PROMOTION_SHIFT = F32_TO_TF32_SHIFT + F32_TO_TF32_BITS; + constexpr uint32_t PROMOTION_MASK = (1u << PROMOTION_BITS) - 1; + desc->tensor_common1 |= (static_cast(promotion) & PROMOTION_MASK) << PROMOTION_SHIFT; + } + + // note that tensor stride has 1 less dim. 
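+  // Strides are encoded in 16-byte units: for each of the NUM_DIMS - 1 outer dimensions,
+  // (stride_bytes >> 4) is split into 32 low bits (tensor_stride_lower[i]) plus 4 high bits
+  // packed into tensor_stride_upper at bit position 4 * i.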
+ void set_tensor_stride(cudaTmaDesc* p_desc, uint64_t const (&tensor_stride_array)[NUM_DIMS - 1]) { + cudaTmaDescTiled* desc = reinterpret_cast(p_desc); + + constexpr uint32_t TENSOR_STRIDE_UPPER_BITS = 4; + constexpr uint32_t TENSOR_STRIDE_UPPER_MASK = (1u << TENSOR_STRIDE_UPPER_BITS) - 1; + + for (uint32_t i = 0; i < NUM_DIMS - 1; i++) { + desc->tensor_stride_lower[i] = 0u; + uint64_t tensor_stride_lower_64b = (tensor_stride_array[i] >> 4) & 0xFFFFFFFFlu; + desc->tensor_stride_lower[i] = static_cast(tensor_stride_lower_64b); + } + desc->tensor_stride_upper = 0u; + + for (uint32_t i = 0; i < NUM_DIMS - 1; i++) { + uint64_t tensor_stride_temp = tensor_stride_array[i]; + tensor_stride_temp = tensor_stride_temp >> 4; + uint64_t tensor_stride_upper = tensor_stride_temp >> 32; + uint32_t tensor_stride_upper_32b = static_cast(tensor_stride_upper); + desc->tensor_stride_upper |= + ((tensor_stride_upper_32b & TENSOR_STRIDE_UPPER_MASK) << (i * TENSOR_STRIDE_UPPER_BITS)); + } + } + + void set_tensor_size(cudaTmaDesc* p_desc, uint32_t const (&tensor_size_array)[NUM_DIMS]) { + cudaTmaDescTiled* desc = reinterpret_cast(p_desc); + for (uint32_t dim = 0; dim < NUM_DIMS; dim++) { + desc->tensor_size[dim] = tensor_size_array[dim] - 1; + } + } + + void set_traversal_stride_tiled(cudaTmaDesc* p_desc, + uint32_t const (&traversal_stride_array)[NUM_DIMS]) { + cudaTmaDescTiled* desc = reinterpret_cast(p_desc); + + desc->traversal_stride_box_0 = 0; + + constexpr uint32_t TRAVERSAL_STRIDE_BITS = 3; + constexpr uint32_t TRAVERSAL_STRIDE_MASK = (1u << TRAVERSAL_STRIDE_BITS) - 1; + + for (uint32_t dim = 0; dim < NUM_DIMS; dim++) { + uint32_t traversal_stride = traversal_stride_array[dim] - 1; + traversal_stride = (traversal_stride & TRAVERSAL_STRIDE_MASK) + << (dim * TRAVERSAL_STRIDE_BITS); + desc->traversal_stride_box_0 |= traversal_stride; + } + } + + void set_box_size(cudaTmaDesc* p_desc, uint32_t const (&box_size_array)[NUM_DIMS]) { + cudaTmaDescTiled* desc = reinterpret_cast(p_desc); + + desc->box_size_end = 0; + + constexpr uint32_t BOX_SIZE_BITS = 8; + constexpr uint32_t BOX_SIZE_MASK = (1 << BOX_SIZE_BITS) - 1; + + if (NUM_DIMS > 1) { + uint32_t box_size_0 = box_size_array[0] - 1; + box_size_0 = box_size_0 & BOX_SIZE_MASK; + box_size_0 = box_size_0 << 24; + desc->traversal_stride_box_0 |= box_size_0; + } + + for (uint32_t dim = 1; dim < NUM_DIMS; dim++) { + uint32_t box_size = box_size_array[dim] - 1; + box_size = box_size & BOX_SIZE_MASK; + box_size = box_size << ((dim - 1) * BOX_SIZE_BITS); + desc->box_size_end |= box_size; + } + } + + void set_traversal_stride_im2col(cudaTmaDesc* p_desc, uint32_t* p_traversal_stride, + uint32_t dims) { + cudaTmaDescIm2Col* desc = reinterpret_cast(p_desc); + + desc->traversal_stride_range_c = 0; + + constexpr uint32_t TRAVERSAL_STRIDE_BITS = 3; + constexpr uint32_t TRAVERSAL_STRIDE_MASK = (1u << (TRAVERSAL_STRIDE_BITS + 1)) - 1; + + for (uint32_t dim = 0; dim < dims; dim++) { + uint32_t traversal_stride = p_traversal_stride[dim] - 1; + traversal_stride = (traversal_stride & TRAVERSAL_STRIDE_MASK) + << (dim * TRAVERSAL_STRIDE_BITS); + desc->traversal_stride_range_c |= traversal_stride; + } + } + + void set_range_c(cudaTmaDesc* p_desc, uint32_t range_c) { + cudaTmaDescIm2Col* desc = reinterpret_cast(p_desc); + + constexpr uint32_t RANGE_C_BITS = 8; + constexpr uint32_t RANGE_C_MASK = (1u << RANGE_C_BITS) - 1; + + range_c = range_c & RANGE_C_MASK; + desc->traversal_stride_range_c |= ((range_c - 1) << 24); + } + + void set_box_corner_dhw(cudaTmaDesc* p_desc, uint32_t* 
p_base_corner, uint32_t* p_far_corner, + uint32_t dims) { + cudaTmaDescIm2Col* desc = reinterpret_cast(p_desc); + + desc->box_corner_dhw = 0; + + uint32_t box_base_corner = 0, box_far_corner = 0; + uint32_t box_corner_dhw = 0; + + if (dims == 3) { + constexpr uint32_t BOX_CORNER_BITS = 16; + constexpr uint32_t BOX_CORNER_MASK = (1u << BOX_CORNER_BITS) - 1; + + box_base_corner = p_base_corner[0] & BOX_CORNER_MASK; + box_far_corner = p_far_corner[0] & BOX_CORNER_MASK; + } + + if (dims == 4) { + constexpr uint32_t BOX_CORNER_BITS = 8; + constexpr uint32_t BOX_CORNER_MASK = (1u << BOX_CORNER_BITS) - 1; + + box_base_corner = p_base_corner[0] & BOX_CORNER_MASK; + box_base_corner |= ((p_base_corner[1] & BOX_CORNER_MASK) << BOX_CORNER_BITS); + + box_far_corner = p_far_corner[0] & BOX_CORNER_MASK; + box_far_corner |= ((p_far_corner[1] & BOX_CORNER_MASK) << BOX_CORNER_BITS); + } + + if (dims == 5) { + constexpr uint32_t BOX_CORNER_BITS = 5; + constexpr uint32_t BOX_CORNER_MASK = (1u << BOX_CORNER_BITS) - 1; + + box_base_corner = p_base_corner[0] & BOX_CORNER_MASK; + box_base_corner |= ((p_base_corner[1] & BOX_CORNER_MASK) << BOX_CORNER_BITS); + box_base_corner |= ((p_base_corner[2] & BOX_CORNER_MASK) << (2 * BOX_CORNER_BITS)); + + box_far_corner = p_far_corner[0] & BOX_CORNER_MASK; + box_far_corner |= ((p_far_corner[1] & BOX_CORNER_MASK) << BOX_CORNER_BITS); + box_far_corner |= ((p_far_corner[2] & BOX_CORNER_MASK) << (2 * BOX_CORNER_BITS)); + } + + box_corner_dhw = box_base_corner; + box_corner_dhw |= (box_far_corner << 16); + + desc->box_corner_dhw = box_corner_dhw; + } + + void set_range_ndhw(cudaTmaDesc* p_desc, uint32_t ndhw) { + cudaTmaDescIm2Col* desc = reinterpret_cast(p_desc); + + desc->range_ndhw = 0; + + constexpr uint32_t RANGE_NDHW_BITS = 10; + constexpr uint32_t RANGE_NDHW_MASK = (1u << RANGE_NDHW_BITS) - 1; + + desc->range_ndhw = ((ndhw - 1) & RANGE_NDHW_MASK); + } + + // The TMA descriptor. Each is of 512 bit. + cudaTmaDesc* desc_ptr_h; + // The TMA descriptor on the device memory. + cudaTmaDesc* desc_ptr_d; + // Number of batches + int batch_size = 0; +}; + +} // namespace fmha diff --git a/csrc/fmha_v2/fmha/hopper/tma_types.h b/csrc/fmha_v2/fmha/hopper/tma_types.h new file mode 100644 index 0000000000..4f5460ef64 --- /dev/null +++ b/csrc/fmha_v2/fmha/hopper/tma_types.h @@ -0,0 +1,123 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2011-2024 NVIDIA CORPORATION & AFFILIATES. All rights + * reserved. SPDX-License-Identifier: NVIDIA TensorRT Source Code License Agreement + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. + */ + +#pragma once + +#include + +namespace fmha { + +// TMA desc type. +typedef enum { TILED = 0, IM2COL } cudaTmaDescType; + +// TMA swizzle type. +typedef enum { + SWIZZLE_DISABLED, + SWIZZLE_32B, + SWIZZLE_64B, + SWIZZLE_128B, + SWIZZLE_MAX +} cudaTmaDescSwizzle; + +typedef enum { BARRIER64, BARRIER128 } cudaTmaDescBarrier; + +// TMA interleave type. +typedef enum { + INTERLEAVE_DISABLED, + INTERLEAVE_16B, + INTERLEAVE_32B, + INTERLEAVE_MAX +} cudaTmaDescInterleave; + +// TMA L2 sector promotion. 
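+// (The enum values below are encoded directly into the descriptor bit fields packed by
+// Multiple_tma_descriptor::set_tensor_common_1 in tma_descriptor.h.)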
+typedef enum { + PROMOTION_DISABLED = 0, + PROMOTION_64B, + PROMOTION_128B, + PROMOTION_256B +} cudaTmaDescPromotion; + +// TMA data type. +typedef enum { + U8 = 0, + U16, + U32, + S32, + U64, + S64, + F16_RN, + F32_RN, + F32_FTZ_RN, + F64_RN, + BF16_RN, + FORMAT_MAX +} cudaTmaDescFormat; + +// TMA cache control. +typedef enum { + PREFETCH, // Prefetch tma descriptor using global memory address + INVALIDATE, // Invalidate tma descriptor in l2 cache + INVALIDATE_ALL // Invalidate tma descriptor and all elements in l2 cache line +} cudaTmaDescCacheCtrl; + +// TMA OOB fill modes. +typedef enum { TENSOR_ZFILL, TENSOR_CFILL } cudaTmaDescOobFillMode; + +constexpr uint64_t k_max_tensor_size = (1llu << 36); +constexpr uint64_t k_max_tensor_stride = (1llu << 36); +constexpr uint64_t k_max_block_size = 256llu; +constexpr uint64_t k_max_traversal_stride = (1llu << 3); + +constexpr uint64_t k_min_tensor_size = 1llu; +constexpr uint64_t k_min_tensor_stride = 0llu; +constexpr uint64_t k_min_block_size = 1llu; +constexpr uint64_t k_min_traversal_stride = 1llu; + +constexpr uint32_t k_max_cta_id = (1 << 6) - 1; + +// The 512 bit of descriptor for tiled mode. +typedef struct { + uint64_t tensor_common0; + uint32_t tensor_common1; + + uint32_t tensor_stride_lower[4]; //< 36b of 64b with 4B aligned + uint32_t tensor_stride_upper; + uint32_t tensor_size[5]; //< value -1 + uint32_t traversal_stride_box_0; //< packed 3b (-1) + + uint32_t box_size_end; +} cudaTmaDescTiled; + +// The 512 bit of descritptro for im2col mode. +typedef struct { + uint64_t tensor_common0; + uint32_t tensor_common1; + + uint32_t tensor_stride_lower[4]; + uint32_t tensor_stride_upper; + uint32_t tensor_size[5]; + uint32_t traversal_stride_range_c; + + uint32_t box_corner_dhw; + uint32_t range_ndhw; +} cudaTmaDescIm2Col; + +// TMA desc size +constexpr uint32_t TMA_DESC_SIZE_IN_BYTE = 64; + +// TMA desc +typedef struct alignas(64) { + uint64_t data[8]; +} cudaTmaDesc; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +} // namespace fmha diff --git a/csrc/fmha_v2/fmha/hopper/utils_gmma.h b/csrc/fmha_v2/fmha/hopper/utils_gmma.h new file mode 100644 index 0000000000..cc070be7de --- /dev/null +++ b/csrc/fmha_v2/fmha/hopper/utils_gmma.h @@ -0,0 +1,18 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2011-2024 NVIDIA CORPORATION & AFFILIATES. All rights + * reserved. SPDX-License-Identifier: NVIDIA TensorRT Source Code License Agreement + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. + */ + +#pragma once + +#include +#include +#include +#include diff --git a/csrc/fmha_v2/fmha/hopper/utils_hgmma.h b/csrc/fmha_v2/fmha/hopper/utils_hgmma.h new file mode 100644 index 0000000000..5112317228 --- /dev/null +++ b/csrc/fmha_v2/fmha/hopper/utils_hgmma.h @@ -0,0 +1,874 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2011-2024 NVIDIA CORPORATION & AFFILIATES. All rights + * reserved. SPDX-License-Identifier: NVIDIA TensorRT Source Code License Agreement + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. 
Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. + */ + +#pragma once + +namespace fmha { + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// GMMAs with fp16 Accumulator +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Hgmma_fp16 {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x8x16 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Hgmma_fp16<8, TA, TB> { + static inline __device__ void mma(uint64_t desc_a, uint64_t desc_b, uint32_t (&acc)[2]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + int const trans_a = TA ? 1 : 0; + int const trans_b = TB ? 1 : 0; + asm volatile( + "wgmma.mma_async.sync.aligned.m64n8k16.f16.f16.f16\n" + "{\n" + " %0, %1\n" + "}, %2, %3, 1, 1, 1, %4, %5;\n" + + : "+r"(acc[0]), "+r"(acc[1]) + : "l"(desc_a), "l"(desc_b), "n"(trans_a), "n"(trans_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x32x16 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Hgmma_fp16<32, TA, TB> { + static inline __device__ void mma(uint64_t desc_a, uint64_t desc_b, uint32_t (&acc)[8]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + int const trans_a = TA ? 1 : 0; + int const trans_b = TB ? 1 : 0; + asm volatile( + "wgmma.mma_async.sync.aligned.m64n32k16.f16.f16.f16\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7 \n" + "},\n" + " %8, %9, 1, 1, 1, %10, %11;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]) + : "l"(desc_a), "l"(desc_b), "n"(trans_a), "n"(trans_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x64x16 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Hgmma_fp16<64, TA, TB> { + static inline __device__ void mma(uint64_t desc_a, uint64_t desc_b, uint32_t (&acc)[16]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + int const trans_a = TA ? 1 : 0; + int const trans_b = TB ? 
1 : 0; + asm volatile( + "wgmma.mma_async.sync.aligned.m64n64k16.f16.f16.f16\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7,\n" + " %8, %9, %10, %11, %12, %13, %14, %15 \n" + "},\n" + " %16, %17, 1, 1, 1, %18, %19;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]), "+r"(acc[8]), "+r"(acc[9]), "+r"(acc[10]), "+r"(acc[11]), + "+r"(acc[12]), "+r"(acc[13]), "+r"(acc[14]), "+r"(acc[15]) + : "l"(desc_a), "l"(desc_b), "n"(trans_a), "n"(trans_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x128x16 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Hgmma_fp16<128, TA, TB> { + static inline __device__ void mma(uint64_t desc_a, uint64_t desc_b, uint32_t (&acc)[32]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + int const trans_a = TA ? 1 : 0; + int const trans_b = TB ? 1 : 0; + asm volatile( + "wgmma.mma_async.sync.aligned.m64n128k16.f16.f16.f16\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7,\n" + " %8, %9, %10, %11, %12, %13, %14, %15,\n" + " %16, %17, %18, %19, %20, %21, %22, %23,\n" + " %24, %25, %26, %27, %28, %29, %30, %31 \n" + "},\n" + " %32, %33, 1, 1, 1, %34, %35;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]), "+r"(acc[8]), "+r"(acc[9]), "+r"(acc[10]), "+r"(acc[11]), + "+r"(acc[12]), "+r"(acc[13]), "+r"(acc[14]), "+r"(acc[15]), "+r"(acc[16]), "+r"(acc[17]), + "+r"(acc[18]), "+r"(acc[19]), "+r"(acc[20]), "+r"(acc[21]), "+r"(acc[22]), "+r"(acc[23]), + "+r"(acc[24]), "+r"(acc[25]), "+r"(acc[26]), "+r"(acc[27]), "+r"(acc[28]), "+r"(acc[29]), + "+r"(acc[30]), "+r"(acc[31]) + : "l"(desc_a), "l"(desc_b), "n"(trans_a), "n"(trans_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x192x16 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Hgmma_fp16<192, TA, TB> { + static inline __device__ void mma(uint64_t desc_a, uint64_t desc_b, uint32_t (&acc)[48]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + int const trans_a = TA ? 1 : 0; + int const trans_b = TB ? 
1 : 0; + asm volatile( + "wgmma.mma_async.sync.aligned.m64n192k16.f16.f16.f16\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7,\n" + " %8, %9, %10, %11, %12, %13, %14, %15,\n" + " %16, %17, %18, %19, %20, %21, %22, %23,\n" + " %24, %25, %26, %27, %28, %29, %30, %31,\n" + " %32, %33, %34, %35, %36, %37, %38, %39,\n" + " %40, %41, %42, %43, %44, %45, %46, %47 \n" + "},\n" + " %48, %49, 1, 1, 1, %50, %51;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]), "+r"(acc[8]), "+r"(acc[9]), "+r"(acc[10]), "+r"(acc[11]), + "+r"(acc[12]), "+r"(acc[13]), "+r"(acc[14]), "+r"(acc[15]), "+r"(acc[16]), "+r"(acc[17]), + "+r"(acc[18]), "+r"(acc[19]), "+r"(acc[20]), "+r"(acc[21]), "+r"(acc[22]), "+r"(acc[23]), + "+r"(acc[24]), "+r"(acc[25]), "+r"(acc[26]), "+r"(acc[27]), "+r"(acc[28]), "+r"(acc[29]), + "+r"(acc[30]), "+r"(acc[31]), "+r"(acc[32]), "+r"(acc[33]), "+r"(acc[34]), "+r"(acc[35]), + "+r"(acc[36]), "+r"(acc[37]), "+r"(acc[38]), "+r"(acc[39]), "+r"(acc[40]), "+r"(acc[41]), + "+r"(acc[42]), "+r"(acc[43]), "+r"(acc[44]), "+r"(acc[45]), "+r"(acc[46]), "+r"(acc[47]) + : "l"(desc_a), "l"(desc_b), "n"(trans_a), "n"(trans_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x256x16 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Hgmma_fp16<256, TA, TB> { + static inline __device__ void mma(uint64_t desc_a, uint64_t desc_b, uint32_t (&acc)[64]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + int const trans_a = TA ? 1 : 0; + int const trans_b = TB ? 1 : 0; + asm volatile( + "wgmma.mma_async.sync.aligned.m64n256k16.f16.f16.f16\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7,\n" + " %8, %9, %10, %11, %12, %13, %14, %15,\n" + " %16, %17, %18, %19, %20, %21, %22, %23,\n" + " %24, %25, %26, %27, %28, %29, %30, %31,\n" + " %32, %33, %34, %35, %36, %37, %38, %39,\n" + " %40, %41, %42, %43, %44, %45, %46, %47,\n" + " %48, %49, %50, %51, %52, %53, %54, %55,\n" + " %56, %57, %58, %59, %60, %61, %62, %63 \n" + "},\n" + " %64, %65, 1, 1, 1, %66, %67;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]), "+r"(acc[8]), "+r"(acc[9]), "+r"(acc[10]), "+r"(acc[11]), + "+r"(acc[12]), "+r"(acc[13]), "+r"(acc[14]), "+r"(acc[15]), "+r"(acc[16]), "+r"(acc[17]), + "+r"(acc[18]), "+r"(acc[19]), "+r"(acc[20]), "+r"(acc[21]), "+r"(acc[22]), "+r"(acc[23]), + "+r"(acc[24]), "+r"(acc[25]), "+r"(acc[26]), "+r"(acc[27]), "+r"(acc[28]), "+r"(acc[29]), + "+r"(acc[30]), "+r"(acc[31]), "+r"(acc[32]), "+r"(acc[33]), "+r"(acc[34]), "+r"(acc[35]), + "+r"(acc[36]), "+r"(acc[37]), "+r"(acc[38]), "+r"(acc[39]), "+r"(acc[40]), "+r"(acc[41]), + "+r"(acc[42]), "+r"(acc[43]), "+r"(acc[44]), "+r"(acc[45]), "+r"(acc[46]), "+r"(acc[47]), + "+r"(acc[48]), "+r"(acc[49]), "+r"(acc[50]), "+r"(acc[51]), "+r"(acc[52]), "+r"(acc[53]), + "+r"(acc[54]), "+r"(acc[55]), "+r"(acc[56]), "+r"(acc[57]), "+r"(acc[58]), "+r"(acc[59]), + "+r"(acc[60]), "+r"(acc[61]), "+r"(acc[62]), "+r"(acc[63]) + : "l"(desc_a), "l"(desc_b), "n"(trans_a), "n"(trans_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ void hgmma_fp16(uint64_t desc_a, uint64_t desc_b, uint32_t (&acc)[N / 4]) { + Hgmma_fp16::mma(desc_a, desc_b, acc); +} + 
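+// Minimal usage sketch of the dispatch helper above (illustrative only; the template-argument
+// order <N, TA, TB> is assumed to mirror the Hgmma_fp16 specializations, and building the SMEM
+// matrix descriptors plus the surrounding wgmma fence/commit/wait sequencing is assumed to happen
+// elsewhere in the kernel):
+//
+//   uint64_t desc_a = ..., desc_b = ...;  // GMMA descriptors of the A and B SMEM tiles
+//   uint32_t acc[64 / 4] = {};            // N = 64 -> 16 registers, each packing two fp16 values
+//   fmha::hgmma_fp16<64, false, false>(desc_a, desc_b, acc);
+//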
+//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// GMMAs with fp32 Accumulator +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Hgmma_fp32 {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x8x16 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Hgmma_fp32<8, TA, TB> { + static inline __device__ void mma(uint64_t desc_a, uint64_t desc_b, uint32_t (&acc)[4]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + int const trans_a = TA ? 1 : 0; + int const trans_b = TB ? 1 : 0; + asm volatile( + "wgmma.mma_async.sync.aligned.m64n8k16.f32.f16.f16\n" + "{%0, %1, %2, %3}, %4, %5, 1, 1, 1, %6, %7;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]) + : "l"(desc_a), "l"(desc_b), "n"(trans_a), "n"(trans_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x64x16 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Hgmma_fp32<64, TA, TB> { + static inline __device__ void mma(uint64_t desc_a, uint64_t desc_b, uint32_t (&acc)[32]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + int const trans_a = TA ? 1 : 0; + int const trans_b = TB ? 1 : 0; + asm volatile( + "wgmma.mma_async.sync.aligned.m64n64k16.f32.f16.f16\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7,\n" + " %8, %9, %10, %11, %12, %13, %14, %15,\n" + " %16, %17, %18, %19, %20, %21, %22, %23,\n" + " %24, %25, %26, %27, %28, %29, %30, %31 \n" + "},\n" + " %32, %33, 1, 1, 1, %34, %35;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]), "+r"(acc[8]), "+r"(acc[9]), "+r"(acc[10]), "+r"(acc[11]), + "+r"(acc[12]), "+r"(acc[13]), "+r"(acc[14]), "+r"(acc[15]), "+r"(acc[16]), "+r"(acc[17]), + "+r"(acc[18]), "+r"(acc[19]), "+r"(acc[20]), "+r"(acc[21]), "+r"(acc[22]), "+r"(acc[23]), + "+r"(acc[24]), "+r"(acc[25]), "+r"(acc[26]), "+r"(acc[27]), "+r"(acc[28]), "+r"(acc[29]), + "+r"(acc[30]), "+r"(acc[31]) + : "l"(desc_a), "l"(desc_b), "n"(trans_a), "n"(trans_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x128x16 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Hgmma_fp32<128, TA, TB> { + static inline __device__ void mma(uint64_t desc_a, uint64_t desc_b, uint32_t (&acc)[64]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + int const trans_a = TA ? 1 : 0; + int const trans_b = TB ? 
1 : 0; + asm volatile( + "wgmma.mma_async.sync.aligned.m64n128k16.f32.f16.f16\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7,\n" + " %8, %9, %10, %11, %12, %13, %14, %15,\n" + " %16, %17, %18, %19, %20, %21, %22, %23,\n" + " %24, %25, %26, %27, %28, %29, %30, %31,\n" + " %32, %33, %34, %35, %36, %37, %38, %39,\n" + " %40, %41, %42, %43, %44, %45, %46, %47,\n" + " %48, %49, %50, %51, %52, %53, %54, %55,\n" + " %56, %57, %58, %59, %60, %61, %62, %63 \n" + "},\n" + " %64, %65, 1, 1, 1, %66, %67;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]), "+r"(acc[8]), "+r"(acc[9]), "+r"(acc[10]), "+r"(acc[11]), + "+r"(acc[12]), "+r"(acc[13]), "+r"(acc[14]), "+r"(acc[15]), "+r"(acc[16]), "+r"(acc[17]), + "+r"(acc[18]), "+r"(acc[19]), "+r"(acc[20]), "+r"(acc[21]), "+r"(acc[22]), "+r"(acc[23]), + "+r"(acc[24]), "+r"(acc[25]), "+r"(acc[26]), "+r"(acc[27]), "+r"(acc[28]), "+r"(acc[29]), + "+r"(acc[30]), "+r"(acc[31]), "+r"(acc[32]), "+r"(acc[33]), "+r"(acc[34]), "+r"(acc[35]), + "+r"(acc[36]), "+r"(acc[37]), "+r"(acc[38]), "+r"(acc[39]), "+r"(acc[40]), "+r"(acc[41]), + "+r"(acc[42]), "+r"(acc[43]), "+r"(acc[44]), "+r"(acc[45]), "+r"(acc[46]), "+r"(acc[47]), + "+r"(acc[48]), "+r"(acc[49]), "+r"(acc[50]), "+r"(acc[51]), "+r"(acc[52]), "+r"(acc[53]), + "+r"(acc[54]), "+r"(acc[55]), "+r"(acc[56]), "+r"(acc[57]), "+r"(acc[58]), "+r"(acc[59]), + "+r"(acc[60]), "+r"(acc[61]), "+r"(acc[62]), "+r"(acc[63]) + : "l"(desc_a), "l"(desc_b), "n"(trans_a), "n"(trans_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x192x16 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Hgmma_fp32<192, TA, TB> { + static inline __device__ void mma(uint64_t desc_a, uint64_t desc_b, uint32_t (&acc)[96]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + int const trans_a = TA ? 1 : 0; + int const trans_b = TB ? 
1 : 0; + asm volatile( + "wgmma.mma_async.sync.aligned.m64n192k16.f32.f16.f16\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7,\n" + " %8, %9, %10, %11, %12, %13, %14, %15,\n" + " %16, %17, %18, %19, %20, %21, %22, %23,\n" + " %24, %25, %26, %27, %28, %29, %30, %31,\n" + " %32, %33, %34, %35, %36, %37, %38, %39,\n" + " %40, %41, %42, %43, %44, %45, %46, %47,\n" + " %48, %49, %50, %51, %52, %53, %54, %55,\n" + " %56, %57, %58, %59, %60, %61, %62, %63,\n" + " %64, %65, %66, %67, %68, %69, %70, %71,\n" + " %72, %73, %74, %75, %76, %77, %78, %79,\n" + " %80, %81, %82, %83, %84, %85, %86, %87,\n" + " %88, %89, %90, %91, %92, %93, %94, %95 \n" + "},\n" + " %96, %97, 1, 1, 1, %98, %99;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]), "+r"(acc[8]), "+r"(acc[9]), "+r"(acc[10]), "+r"(acc[11]), + "+r"(acc[12]), "+r"(acc[13]), "+r"(acc[14]), "+r"(acc[15]), "+r"(acc[16]), "+r"(acc[17]), + "+r"(acc[18]), "+r"(acc[19]), "+r"(acc[20]), "+r"(acc[21]), "+r"(acc[22]), "+r"(acc[23]), + "+r"(acc[24]), "+r"(acc[25]), "+r"(acc[26]), "+r"(acc[27]), "+r"(acc[28]), "+r"(acc[29]), + "+r"(acc[30]), "+r"(acc[31]), "+r"(acc[32]), "+r"(acc[33]), "+r"(acc[34]), "+r"(acc[35]), + "+r"(acc[36]), "+r"(acc[37]), "+r"(acc[38]), "+r"(acc[39]), "+r"(acc[40]), "+r"(acc[41]), + "+r"(acc[42]), "+r"(acc[43]), "+r"(acc[44]), "+r"(acc[45]), "+r"(acc[46]), "+r"(acc[47]), + "+r"(acc[48]), "+r"(acc[49]), "+r"(acc[50]), "+r"(acc[51]), "+r"(acc[52]), "+r"(acc[53]), + "+r"(acc[54]), "+r"(acc[55]), "+r"(acc[56]), "+r"(acc[57]), "+r"(acc[58]), "+r"(acc[59]), + "+r"(acc[60]), "+r"(acc[61]), "+r"(acc[62]), "+r"(acc[63]), "+r"(acc[64]), "+r"(acc[65]), + "+r"(acc[66]), "+r"(acc[67]), "+r"(acc[68]), "+r"(acc[69]), "+r"(acc[70]), "+r"(acc[71]), + "+r"(acc[72]), "+r"(acc[73]), "+r"(acc[74]), "+r"(acc[75]), "+r"(acc[76]), "+r"(acc[77]), + "+r"(acc[78]), "+r"(acc[79]), "+r"(acc[80]), "+r"(acc[81]), "+r"(acc[82]), "+r"(acc[83]), + "+r"(acc[84]), "+r"(acc[85]), "+r"(acc[86]), "+r"(acc[87]), "+r"(acc[88]), "+r"(acc[89]), + "+r"(acc[90]), "+r"(acc[91]), "+r"(acc[92]), "+r"(acc[93]), "+r"(acc[94]), "+r"(acc[95]) + : "l"(desc_a), "l"(desc_b), "n"(trans_a), "n"(trans_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x256x16 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Hgmma_fp32<256, TA, TB> { + static inline __device__ void mma(uint64_t desc_a, uint64_t desc_b, uint32_t (&acc)[128]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + int const trans_a = TA ? 1 : 0; + int const trans_b = TB ? 
1 : 0; + asm volatile( + "wgmma.mma_async.sync.aligned.m64n256k16.f32.f16.f16\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7,\n" + " %8, %9, %10, %11, %12, %13, %14, %15,\n" + " %16, %17, %18, %19, %20, %21, %22, %23,\n" + " %24, %25, %26, %27, %28, %29, %30, %31,\n" + " %32, %33, %34, %35, %36, %37, %38, %39,\n" + " %40, %41, %42, %43, %44, %45, %46, %47,\n" + " %48, %49, %50, %51, %52, %53, %54, %55,\n" + " %56, %57, %58, %59, %60, %61, %62, %63,\n" + " %64, %65, %66, %67, %68, %69, %70, %71,\n" + " %72, %73, %74, %75, %76, %77, %78, %79,\n" + " %80, %81, %82, %83, %84, %85, %86, %87,\n" + " %88, %89, %90, %91, %92, %93, %94, %95,\n" + " %96, %97, %98, %99, %100, %101, %102, %103,\n" + " %104, %105, %106, %107, %108, %109, %110, %111,\n" + " %112, %113, %114, %115, %116, %117, %118, %119,\n" + " %120, %121, %122, %123, %124, %125, %126, %127 \n" + "},\n" + " %128, %129, 1, 1, 1, %130, %131;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]), "+r"(acc[8]), "+r"(acc[9]), "+r"(acc[10]), "+r"(acc[11]), + "+r"(acc[12]), "+r"(acc[13]), "+r"(acc[14]), "+r"(acc[15]), "+r"(acc[16]), "+r"(acc[17]), + "+r"(acc[18]), "+r"(acc[19]), "+r"(acc[20]), "+r"(acc[21]), "+r"(acc[22]), "+r"(acc[23]), + "+r"(acc[24]), "+r"(acc[25]), "+r"(acc[26]), "+r"(acc[27]), "+r"(acc[28]), "+r"(acc[29]), + "+r"(acc[30]), "+r"(acc[31]), "+r"(acc[32]), "+r"(acc[33]), "+r"(acc[34]), "+r"(acc[35]), + "+r"(acc[36]), "+r"(acc[37]), "+r"(acc[38]), "+r"(acc[39]), "+r"(acc[40]), "+r"(acc[41]), + "+r"(acc[42]), "+r"(acc[43]), "+r"(acc[44]), "+r"(acc[45]), "+r"(acc[46]), "+r"(acc[47]), + "+r"(acc[48]), "+r"(acc[49]), "+r"(acc[50]), "+r"(acc[51]), "+r"(acc[52]), "+r"(acc[53]), + "+r"(acc[54]), "+r"(acc[55]), "+r"(acc[56]), "+r"(acc[57]), "+r"(acc[58]), "+r"(acc[59]), + "+r"(acc[60]), "+r"(acc[61]), "+r"(acc[62]), "+r"(acc[63]), "+r"(acc[64]), "+r"(acc[65]), + "+r"(acc[66]), "+r"(acc[67]), "+r"(acc[68]), "+r"(acc[69]), "+r"(acc[70]), "+r"(acc[71]), + "+r"(acc[72]), "+r"(acc[73]), "+r"(acc[74]), "+r"(acc[75]), "+r"(acc[76]), "+r"(acc[77]), + "+r"(acc[78]), "+r"(acc[79]), "+r"(acc[80]), "+r"(acc[81]), "+r"(acc[82]), "+r"(acc[83]), + "+r"(acc[84]), "+r"(acc[85]), "+r"(acc[86]), "+r"(acc[87]), "+r"(acc[88]), "+r"(acc[89]), + "+r"(acc[90]), "+r"(acc[91]), "+r"(acc[92]), "+r"(acc[93]), "+r"(acc[94]), "+r"(acc[95]), + "+r"(acc[96]), "+r"(acc[97]), "+r"(acc[98]), "+r"(acc[99]), "+r"(acc[100]), + "+r"(acc[101]), "+r"(acc[102]), "+r"(acc[103]), "+r"(acc[104]), "+r"(acc[105]), + "+r"(acc[106]), "+r"(acc[107]), "+r"(acc[108]), "+r"(acc[109]), "+r"(acc[110]), + "+r"(acc[111]), "+r"(acc[112]), "+r"(acc[113]), "+r"(acc[114]), "+r"(acc[115]), + "+r"(acc[116]), "+r"(acc[117]), "+r"(acc[118]), "+r"(acc[119]), "+r"(acc[120]), + "+r"(acc[121]), "+r"(acc[122]), "+r"(acc[123]), "+r"(acc[124]), "+r"(acc[125]), + "+r"(acc[126]), "+r"(acc[127]) + : "l"(desc_a), "l"(desc_b), "n"(trans_a), "n"(trans_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ void hgmma_fp32(uint64_t desc_a, uint64_t desc_b, uint32_t (&acc)[N / 2]) { + Hgmma_fp32::mma(desc_a, desc_b, acc); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// GMMAs with fp16 Accumulator, where A is coming from RF +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Hgmma_rfa_fp16 {}; + 
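+// As the banner above says, the A operand for these "rfa" variants comes from the register file:
+// each thread supplies a uint32_t[4] fragment for A instead of an SMEM matrix descriptor, only B
+// is still addressed through a GMMA descriptor, and consequently only the trans_b immediate
+// remains.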
+//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x8x16 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Hgmma_rfa_fp16<8, TB> { + static inline __device__ void mma(uint32_t const (&a)[4], uint64_t desc_b, uint32_t (&acc)[2]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + int const trans_b = TB ? 1 : 0; + asm volatile( + "wgmma.mma_async.sync.aligned.m64n8k16.f16.f16.f16 " + "{%0, %1}, {%2, %3, %4, %5}, %6, 1, 1, 1, %7;\n" + + : "+r"(acc[0]), "+r"(acc[1]) + : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "l"(desc_a), "l"(desc_b), "n"(trans_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x16x16 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Hgmma_rfa_fp16<16, TB> { + static inline __device__ void mma(uint32_t const (&a)[4], uint64_t desc_b, uint32_t (&acc)[4]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + int const trans_b = TB ? 1 : 0; + asm volatile( + "wgmma.mma_async.sync.aligned.m64n16k16.f16.f16.f16 " + "{ %0, %1, %2, %3 },\n" + "{ %4, %5, %6, %7 }, %8, 1, 1, 1, %9;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]) + : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "l"(desc_b), "n"(trans_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x32x16 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Hgmma_rfa_fp16<32, TB> { + static inline __device__ void mma(uint32_t const (&a)[4], uint64_t desc_b, uint32_t (&acc)[8]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + int const trans_b = TB ? 1 : 0; + asm volatile( + "wgmma.mma_async.sync.aligned.m64n32k16.f16.f16.f16 " + "{ %0, %1, %2, %3, %4, %5, %6, %7 },\n" + "{ %8, %9, %10, %11 }, %12, 1, 1, 1, %13;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]) + : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "l"(desc_b), "n"(trans_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x64x16 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Hgmma_rfa_fp16<64, TB> { + static inline __device__ void mma(uint32_t const (&a)[4], uint64_t desc_b, uint32_t (&acc)[16]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + int const trans_b = TB ? 
1 : 0; + asm volatile( + "wgmma.mma_async.sync.aligned.m64n64k16.f16.f16.f16 " + "{" + " %0, %1, %2, %3, %4, %5, %6, %7,\n" + " %8, %9, %10, %11, %12, %13, %14, %15 \n" + "},\n" + "{ %16, %17, %18, %19 }, %20, 1, 1, 1, %21;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]), "+r"(acc[8]), "+r"(acc[9]), "+r"(acc[10]), "+r"(acc[11]), + "+r"(acc[12]), "+r"(acc[13]), "+r"(acc[14]), "+r"(acc[15]) + : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "l"(desc_b), "n"(trans_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x128x16 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Hgmma_rfa_fp16<128, TB> { + static inline __device__ void mma(uint32_t const (&a)[4], uint64_t desc_b, uint32_t (&acc)[32]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + int const trans_b = TB ? 1 : 0; + asm volatile( + "wgmma.mma_async.sync.aligned.m64n128k16.f16.f16.f16 " + "{" + " %0, %1, %2, %3, %4, %5, %6, %7,\n" + " %8, %9, %10, %11, %12, %13, %14, %15,\n" + " %16, %17, %18, %19, %20, %21, %22, %23,\n" + " %24, %25, %26, %27, %28, %29, %30, %31 \n" + "},\n" + "{ %32, %33, %34, %35 }, %36, 1, 1, 1, %37;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]), "+r"(acc[8]), "+r"(acc[9]), "+r"(acc[10]), "+r"(acc[11]), + "+r"(acc[12]), "+r"(acc[13]), "+r"(acc[14]), "+r"(acc[15]), "+r"(acc[16]), "+r"(acc[17]), + "+r"(acc[18]), "+r"(acc[19]), "+r"(acc[20]), "+r"(acc[21]), "+r"(acc[22]), "+r"(acc[23]), + "+r"(acc[24]), "+r"(acc[25]), "+r"(acc[26]), "+r"(acc[27]), "+r"(acc[28]), "+r"(acc[29]), + "+r"(acc[30]), "+r"(acc[31]) + : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "l"(desc_b), "n"(trans_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x192x16 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Hgmma_rfa_fp16<192, TB> { + static inline __device__ void mma(const uint32_t (&a)[4], uint64_t desc_b, uint32_t (&acc)[48]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + int const trans_b = TB ? 
1 : 0; + asm volatile( + "wgmma.mma_async.sync.aligned.m64n192k16.f16.f16.f16 " + "{" + " %0, %1, %2, %3, %4, %5, %6, %7,\n" + " %8, %9, %10, %11, %12, %13, %14, %15,\n" + " %16, %17, %18, %19, %20, %21, %22, %23,\n" + " %24, %25, %26, %27, %28, %29, %30, %31,\n" + " %32, %33, %34, %35, %36, %37, %38, %39,\n" + " %40, %41, %42, %43, %44, %45, %46, %47 \n" + "},\n" + "{ %48, %49, %50, %51 }, %52, 1, 1, 1, %53;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]), "+r"(acc[8]), "+r"(acc[9]), "+r"(acc[10]), "+r"(acc[11]), + "+r"(acc[12]), "+r"(acc[13]), "+r"(acc[14]), "+r"(acc[15]), "+r"(acc[16]), "+r"(acc[17]), + "+r"(acc[18]), "+r"(acc[19]), "+r"(acc[20]), "+r"(acc[21]), "+r"(acc[22]), "+r"(acc[23]), + "+r"(acc[24]), "+r"(acc[25]), "+r"(acc[26]), "+r"(acc[27]), "+r"(acc[28]), "+r"(acc[29]), + "+r"(acc[30]), "+r"(acc[31]), "+r"(acc[32]), "+r"(acc[33]), "+r"(acc[34]), "+r"(acc[35]), + "+r"(acc[36]), "+r"(acc[37]), "+r"(acc[38]), "+r"(acc[39]), "+r"(acc[40]), "+r"(acc[41]), + "+r"(acc[42]), "+r"(acc[43]), "+r"(acc[44]), "+r"(acc[45]), "+r"(acc[46]), "+r"(acc[47]) + : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "l"(desc_b), "n"(trans_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x256x16 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Hgmma_rfa_fp16<256, TB> { + static inline __device__ void mma(uint32_t const (&a)[4], uint64_t desc_b, uint32_t (&acc)[64]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + int const trans_b = TB ? 1 : 0; + asm volatile( + "wgmma.mma_async.sync.aligned.m64n256k16.f16.f16.f16 " + "{" + " %0, %1, %2, %3, %4, %5, %6, %7,\n" + " %8, %9, %10, %11, %12, %13, %14, %15,\n" + " %16, %17, %18, %19, %20, %21, %22, %23,\n" + " %24, %25, %26, %27, %28, %29, %30, %31,\n" + " %32, %33, %34, %35, %36, %37, %38, %39,\n" + " %40, %41, %42, %43, %44, %45, %46, %47,\n" + " %48, %49, %50, %51, %52, %53, %54, %55,\n" + " %56, %57, %58, %59, %60, %61, %62, %63 \n" + "},\n" + "{ %64, %65, %66, %67 }, %68, 1, 1, 1, %69;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]), "+r"(acc[8]), "+r"(acc[9]), "+r"(acc[10]), "+r"(acc[11]), + "+r"(acc[12]), "+r"(acc[13]), "+r"(acc[14]), "+r"(acc[15]), "+r"(acc[16]), "+r"(acc[17]), + "+r"(acc[18]), "+r"(acc[19]), "+r"(acc[20]), "+r"(acc[21]), "+r"(acc[22]), "+r"(acc[23]), + "+r"(acc[24]), "+r"(acc[25]), "+r"(acc[26]), "+r"(acc[27]), "+r"(acc[28]), "+r"(acc[29]), + "+r"(acc[30]), "+r"(acc[31]), "+r"(acc[32]), "+r"(acc[33]), "+r"(acc[34]), "+r"(acc[35]), + "+r"(acc[36]), "+r"(acc[37]), "+r"(acc[38]), "+r"(acc[39]), "+r"(acc[40]), "+r"(acc[41]), + "+r"(acc[42]), "+r"(acc[43]), "+r"(acc[44]), "+r"(acc[45]), "+r"(acc[46]), "+r"(acc[47]), + "+r"(acc[48]), "+r"(acc[49]), "+r"(acc[50]), "+r"(acc[51]), "+r"(acc[52]), "+r"(acc[53]), + "+r"(acc[54]), "+r"(acc[55]), "+r"(acc[56]), "+r"(acc[57]), "+r"(acc[58]), "+r"(acc[59]), + "+r"(acc[60]), "+r"(acc[61]), "+r"(acc[62]), "+r"(acc[63]) + : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "l"(desc_b), "n"(trans_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ void hgmma_rfa_fp16(uint32_t const (&a)[4], uint64_t desc_b, + uint32_t (&acc)[N / 4]) { + Hgmma_rfa_fp16::mma(a, desc_b, 
acc); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// GMMAs with fp32 Accumulator, where A is coming from RF +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Hgmma_rfa_fp32 {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x8x16 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Hgmma_rfa_fp32<8, TB> { + static inline __device__ void mma(uint32_t const (&a)[4], uint64_t desc_b, uint32_t (&acc)[4]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + int const trans_b = TB ? 1 : 0; + asm volatile( + "wgmma.mma_async.sync.aligned.m64n8k16.f32.f16.f16\n" + "{\n" + " %0, %1, %2, %3\n" + "}\n," + "{ %4, %5, %6, %7 }, %8, 1, 1, 1, %9;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]) + : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "l"(desc_b), "n"(trans_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x32x16 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Hgmma_rfa_fp32<32, TB> { + static inline __device__ void mma(uint32_t const (&a)[4], uint64_t desc_b, uint32_t (&acc)[16]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + int const trans_b = TB ? 1 : 0; + asm volatile( + "wgmma.mma_async.sync.aligned.m64n32k16.f32.f16.f16\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7,\n" + " %8, %9, %10, %11, %12, %13, %14, %15 \n" + "},\n" + "{ %16, %17, %18, %19 }, %20, 1, 1, 1, %21;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]), "+r"(acc[8]), "+r"(acc[9]), "+r"(acc[10]), "+r"(acc[11]), + "+r"(acc[12]), "+r"(acc[13]), "+r"(acc[14]), "+r"(acc[15]) + : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "l"(desc_b), "n"(trans_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x64x16 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Hgmma_rfa_fp32<64, TB> { + static inline __device__ void mma(uint32_t const (&a)[4], uint64_t desc_b, uint32_t (&acc)[32]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + int const trans_b = TB ? 
1 : 0; + asm volatile( + "wgmma.mma_async.sync.aligned.m64n64k16.f32.f16.f16\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7,\n" + " %8, %9, %10, %11, %12, %13, %14, %15,\n" + " %16, %17, %18, %19, %20, %21, %22, %23,\n" + " %24, %25, %26, %27, %28, %29, %30, %31 \n" + "},\n" + "{ %32, %33, %34, %35 }, %36, 1, 1, 1, %37;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]), "+r"(acc[8]), "+r"(acc[9]), "+r"(acc[10]), "+r"(acc[11]), + "+r"(acc[12]), "+r"(acc[13]), "+r"(acc[14]), "+r"(acc[15]), "+r"(acc[16]), "+r"(acc[17]), + "+r"(acc[18]), "+r"(acc[19]), "+r"(acc[20]), "+r"(acc[21]), "+r"(acc[22]), "+r"(acc[23]), + "+r"(acc[24]), "+r"(acc[25]), "+r"(acc[26]), "+r"(acc[27]), "+r"(acc[28]), "+r"(acc[29]), + "+r"(acc[30]), "+r"(acc[31]) + : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "l"(desc_b), "n"(trans_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x128x16 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Hgmma_rfa_fp32<128, TB> { + static inline __device__ void mma(uint32_t const (&a)[4], uint64_t desc_b, uint32_t (&acc)[64]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + int const trans_b = TB ? 1 : 0; + asm volatile( + "wgmma.mma_async.sync.aligned.m64n128k16.f32.f16.f16\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7,\n" + " %8, %9, %10, %11, %12, %13, %14, %15,\n" + " %16, %17, %18, %19, %20, %21, %22, %23,\n" + " %24, %25, %26, %27, %28, %29, %30, %31,\n" + " %32, %33, %34, %35, %36, %37, %38, %39,\n" + " %40, %41, %42, %43, %44, %45, %46, %47,\n" + " %48, %49, %50, %51, %52, %53, %54, %55,\n" + " %56, %57, %58, %59, %60, %61, %62, %63 \n" + "},\n" + "{ %64, %65, %66, %67 }, %68, 1, 1, 1, %69;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]), "+r"(acc[8]), "+r"(acc[9]), "+r"(acc[10]), "+r"(acc[11]), + "+r"(acc[12]), "+r"(acc[13]), "+r"(acc[14]), "+r"(acc[15]), "+r"(acc[16]), "+r"(acc[17]), + "+r"(acc[18]), "+r"(acc[19]), "+r"(acc[20]), "+r"(acc[21]), "+r"(acc[22]), "+r"(acc[23]), + "+r"(acc[24]), "+r"(acc[25]), "+r"(acc[26]), "+r"(acc[27]), "+r"(acc[28]), "+r"(acc[29]), + "+r"(acc[30]), "+r"(acc[31]), "+r"(acc[32]), "+r"(acc[33]), "+r"(acc[34]), "+r"(acc[35]), + "+r"(acc[36]), "+r"(acc[37]), "+r"(acc[38]), "+r"(acc[39]), "+r"(acc[40]), "+r"(acc[41]), + "+r"(acc[42]), "+r"(acc[43]), "+r"(acc[44]), "+r"(acc[45]), "+r"(acc[46]), "+r"(acc[47]), + "+r"(acc[48]), "+r"(acc[49]), "+r"(acc[50]), "+r"(acc[51]), "+r"(acc[52]), "+r"(acc[53]), + "+r"(acc[54]), "+r"(acc[55]), "+r"(acc[56]), "+r"(acc[57]), "+r"(acc[58]), "+r"(acc[59]), + "+r"(acc[60]), "+r"(acc[61]), "+r"(acc[62]), "+r"(acc[63]) + : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "l"(desc_b), "n"(trans_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x192x16 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Hgmma_rfa_fp32<192, TB> { + static inline __device__ void mma(const uint32_t (&a)[4], uint64_t desc_b, uint32_t (&acc)[96]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + int const trans_b = TB ? 
1 : 0; + asm volatile( + "wgmma.mma_async.sync.aligned.m64n192k16.f32.f16.f16\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7,\n" + " %8, %9, %10, %11, %12, %13, %14, %15,\n" + " %16, %17, %18, %19, %20, %21, %22, %23,\n" + " %24, %25, %26, %27, %28, %29, %30, %31,\n" + " %32, %33, %34, %35, %36, %37, %38, %39,\n" + " %40, %41, %42, %43, %44, %45, %46, %47,\n" + " %48, %49, %50, %51, %52, %53, %54, %55,\n" + " %56, %57, %58, %59, %60, %61, %62, %63,\n" + " %64, %65, %66, %67, %68, %69, %70, %71,\n" + " %72, %73, %74, %75, %76, %77, %78, %79,\n" + " %80, %81, %82, %83, %84, %85, %86, %87,\n" + " %88, %89, %90, %91, %92, %93, %94, %95 \n" + "},\n" + "{ %96, %97, %98, %99 }, %100, 1, 1, 1, %101;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]), "+r"(acc[8]), "+r"(acc[9]), "+r"(acc[10]), "+r"(acc[11]), + "+r"(acc[12]), "+r"(acc[13]), "+r"(acc[14]), "+r"(acc[15]), "+r"(acc[16]), "+r"(acc[17]), + "+r"(acc[18]), "+r"(acc[19]), "+r"(acc[20]), "+r"(acc[21]), "+r"(acc[22]), "+r"(acc[23]), + "+r"(acc[24]), "+r"(acc[25]), "+r"(acc[26]), "+r"(acc[27]), "+r"(acc[28]), "+r"(acc[29]), + "+r"(acc[30]), "+r"(acc[31]), "+r"(acc[32]), "+r"(acc[33]), "+r"(acc[34]), "+r"(acc[35]), + "+r"(acc[36]), "+r"(acc[37]), "+r"(acc[38]), "+r"(acc[39]), "+r"(acc[40]), "+r"(acc[41]), + "+r"(acc[42]), "+r"(acc[43]), "+r"(acc[44]), "+r"(acc[45]), "+r"(acc[46]), "+r"(acc[47]), + "+r"(acc[48]), "+r"(acc[49]), "+r"(acc[50]), "+r"(acc[51]), "+r"(acc[52]), "+r"(acc[53]), + "+r"(acc[54]), "+r"(acc[55]), "+r"(acc[56]), "+r"(acc[57]), "+r"(acc[58]), "+r"(acc[59]), + "+r"(acc[60]), "+r"(acc[61]), "+r"(acc[62]), "+r"(acc[63]), "+r"(acc[64]), "+r"(acc[65]), + "+r"(acc[66]), "+r"(acc[67]), "+r"(acc[68]), "+r"(acc[69]), "+r"(acc[70]), "+r"(acc[71]), + "+r"(acc[72]), "+r"(acc[73]), "+r"(acc[74]), "+r"(acc[75]), "+r"(acc[76]), "+r"(acc[77]), + "+r"(acc[78]), "+r"(acc[79]), "+r"(acc[80]), "+r"(acc[81]), "+r"(acc[82]), "+r"(acc[83]), + "+r"(acc[84]), "+r"(acc[85]), "+r"(acc[86]), "+r"(acc[87]), "+r"(acc[88]), "+r"(acc[89]), + "+r"(acc[90]), "+r"(acc[91]), "+r"(acc[92]), "+r"(acc[93]), "+r"(acc[94]), "+r"(acc[95]) + : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "l"(desc_b), "n"(trans_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x256x16 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Hgmma_rfa_fp32<256, TB> { + static inline __device__ void mma(uint32_t const (&a)[4], uint64_t desc_b, uint32_t (&acc)[128]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + int const trans_b = TB ? 
1 : 0; + asm volatile( + "wgmma.mma_async.sync.aligned.m64n256k16.f32.f16.f16\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7,\n" + " %8, %9, %10, %11, %12, %13, %14, %15,\n" + " %16, %17, %18, %19, %20, %21, %22, %23,\n" + " %24, %25, %26, %27, %28, %29, %30, %31,\n" + " %32, %33, %34, %35, %36, %37, %38, %39,\n" + " %40, %41, %42, %43, %44, %45, %46, %47,\n" + " %48, %49, %50, %51, %52, %53, %54, %55,\n" + " %56, %57, %58, %59, %60, %61, %62, %63,\n" + " %64, %65, %66, %67, %68, %69, %70, %71,\n" + " %72, %73, %74, %75, %76, %77, %78, %79,\n" + " %80, %81, %82, %83, %84, %85, %86, %87,\n" + " %88, %89, %90, %91, %92, %93, %94, %95,\n" + " %96, %97, %98, %99, %100, %101, %102, %103,\n" + " %104, %105, %106, %107, %108, %109, %110, %111,\n" + " %112, %113, %114, %115, %116, %117, %118, %119,\n" + " %120, %121, %122, %123, %124, %125, %126, %127 \n" + "},\n" + "{ %128, %129, %130, %131 }, %132, 1, 1, 1, %133;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]), "+r"(acc[8]), "+r"(acc[9]), "+r"(acc[10]), "+r"(acc[11]), + "+r"(acc[12]), "+r"(acc[13]), "+r"(acc[14]), "+r"(acc[15]), "+r"(acc[16]), "+r"(acc[17]), + "+r"(acc[18]), "+r"(acc[19]), "+r"(acc[20]), "+r"(acc[21]), "+r"(acc[22]), "+r"(acc[23]), + "+r"(acc[24]), "+r"(acc[25]), "+r"(acc[26]), "+r"(acc[27]), "+r"(acc[28]), "+r"(acc[29]), + "+r"(acc[30]), "+r"(acc[31]), "+r"(acc[32]), "+r"(acc[33]), "+r"(acc[34]), "+r"(acc[35]), + "+r"(acc[36]), "+r"(acc[37]), "+r"(acc[38]), "+r"(acc[39]), "+r"(acc[40]), "+r"(acc[41]), + "+r"(acc[42]), "+r"(acc[43]), "+r"(acc[44]), "+r"(acc[45]), "+r"(acc[46]), "+r"(acc[47]), + "+r"(acc[48]), "+r"(acc[49]), "+r"(acc[50]), "+r"(acc[51]), "+r"(acc[52]), "+r"(acc[53]), + "+r"(acc[54]), "+r"(acc[55]), "+r"(acc[56]), "+r"(acc[57]), "+r"(acc[58]), "+r"(acc[59]), + "+r"(acc[60]), "+r"(acc[61]), "+r"(acc[62]), "+r"(acc[63]), "+r"(acc[64]), "+r"(acc[65]), + "+r"(acc[66]), "+r"(acc[67]), "+r"(acc[68]), "+r"(acc[69]), "+r"(acc[70]), "+r"(acc[71]), + "+r"(acc[72]), "+r"(acc[73]), "+r"(acc[74]), "+r"(acc[75]), "+r"(acc[76]), "+r"(acc[77]), + "+r"(acc[78]), "+r"(acc[79]), "+r"(acc[80]), "+r"(acc[81]), "+r"(acc[82]), "+r"(acc[83]), + "+r"(acc[84]), "+r"(acc[85]), "+r"(acc[86]), "+r"(acc[87]), "+r"(acc[88]), "+r"(acc[89]), + "+r"(acc[90]), "+r"(acc[91]), "+r"(acc[92]), "+r"(acc[93]), "+r"(acc[94]), "+r"(acc[95]), + "+r"(acc[96]), "+r"(acc[97]), "+r"(acc[98]), "+r"(acc[99]), "+r"(acc[100]), + "+r"(acc[101]), "+r"(acc[102]), "+r"(acc[103]), "+r"(acc[104]), "+r"(acc[105]), + "+r"(acc[106]), "+r"(acc[107]), "+r"(acc[108]), "+r"(acc[109]), "+r"(acc[110]), + "+r"(acc[111]), "+r"(acc[112]), "+r"(acc[113]), "+r"(acc[114]), "+r"(acc[115]), + "+r"(acc[116]), "+r"(acc[117]), "+r"(acc[118]), "+r"(acc[119]), "+r"(acc[120]), + "+r"(acc[121]), "+r"(acc[122]), "+r"(acc[123]), "+r"(acc[124]), "+r"(acc[125]), + "+r"(acc[126]), "+r"(acc[127]) + : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "l"(desc_b), "n"(trans_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ void hgmma_rfa_fp32(uint32_t const (&a)[4], uint64_t desc_b, + uint32_t (&acc)[N / 2]) { + Hgmma_rfa_fp32::mma(a, desc_b, acc); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace fmha diff --git a/csrc/fmha_v2/fmha/hopper/utils_hgmma_bf16.h b/csrc/fmha_v2/fmha/hopper/utils_hgmma_bf16.h new file mode 100644 index 0000000000..7b17b508bb --- 
/dev/null +++ b/csrc/fmha_v2/fmha/hopper/utils_hgmma_bf16.h @@ -0,0 +1,475 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2011-2024 NVIDIA CORPORATION & AFFILIATES. All rights + * reserved. SPDX-License-Identifier: NVIDIA TensorRT Source Code License Agreement + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. + */ + +#pragma once + +namespace fmha { + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// BF16 GMMAs with FP32 Accumulator +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Hgmma_bf16 {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x8x16 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Hgmma_bf16<8, TA, TB> { + static inline __device__ void mma(uint64_t desc_a, uint64_t desc_b, uint32_t (&acc)[4]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + int const trans_a = TA ? 1 : 0; + int const trans_b = TB ? 1 : 0; + asm volatile( + "wgmma.mma_async.sync.aligned.m64n8k16.f32.bf16.bf16\n" + "{%0, %1, %2, %3}, %4, %5, 1, 1, 1, %6, %7;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]) + : "l"(desc_a), "l"(desc_b), "n"(trans_a), "n"(trans_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x64x16 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Hgmma_bf16<64, TA, TB> { + static inline __device__ void mma(uint64_t desc_a, uint64_t desc_b, uint32_t (&acc)[32]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + int const trans_a = TA ? 1 : 0; + int const trans_b = TB ? 
1 : 0; + asm volatile( + "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7,\n" + " %8, %9, %10, %11, %12, %13, %14, %15,\n" + " %16, %17, %18, %19, %20, %21, %22, %23,\n" + " %24, %25, %26, %27, %28, %29, %30, %31 \n" + "},\n" + " %32, %33, 1, 1, 1, %34, %35;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]), "+r"(acc[8]), "+r"(acc[9]), "+r"(acc[10]), "+r"(acc[11]), + "+r"(acc[12]), "+r"(acc[13]), "+r"(acc[14]), "+r"(acc[15]), "+r"(acc[16]), "+r"(acc[17]), + "+r"(acc[18]), "+r"(acc[19]), "+r"(acc[20]), "+r"(acc[21]), "+r"(acc[22]), "+r"(acc[23]), + "+r"(acc[24]), "+r"(acc[25]), "+r"(acc[26]), "+r"(acc[27]), "+r"(acc[28]), "+r"(acc[29]), + "+r"(acc[30]), "+r"(acc[31]) + : "l"(desc_a), "l"(desc_b), "n"(trans_a), "n"(trans_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x128x16 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Hgmma_bf16<128, TA, TB> { + static inline __device__ void mma(uint64_t desc_a, uint64_t desc_b, uint32_t (&acc)[64]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + int const trans_a = TA ? 1 : 0; + int const trans_b = TB ? 1 : 0; + asm volatile( + "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7,\n" + " %8, %9, %10, %11, %12, %13, %14, %15,\n" + " %16, %17, %18, %19, %20, %21, %22, %23,\n" + " %24, %25, %26, %27, %28, %29, %30, %31,\n" + " %32, %33, %34, %35, %36, %37, %38, %39,\n" + " %40, %41, %42, %43, %44, %45, %46, %47,\n" + " %48, %49, %50, %51, %52, %53, %54, %55,\n" + " %56, %57, %58, %59, %60, %61, %62, %63 \n" + "},\n" + " %64, %65, 1, 1, 1, %66, %67;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]), "+r"(acc[8]), "+r"(acc[9]), "+r"(acc[10]), "+r"(acc[11]), + "+r"(acc[12]), "+r"(acc[13]), "+r"(acc[14]), "+r"(acc[15]), "+r"(acc[16]), "+r"(acc[17]), + "+r"(acc[18]), "+r"(acc[19]), "+r"(acc[20]), "+r"(acc[21]), "+r"(acc[22]), "+r"(acc[23]), + "+r"(acc[24]), "+r"(acc[25]), "+r"(acc[26]), "+r"(acc[27]), "+r"(acc[28]), "+r"(acc[29]), + "+r"(acc[30]), "+r"(acc[31]), "+r"(acc[32]), "+r"(acc[33]), "+r"(acc[34]), "+r"(acc[35]), + "+r"(acc[36]), "+r"(acc[37]), "+r"(acc[38]), "+r"(acc[39]), "+r"(acc[40]), "+r"(acc[41]), + "+r"(acc[42]), "+r"(acc[43]), "+r"(acc[44]), "+r"(acc[45]), "+r"(acc[46]), "+r"(acc[47]), + "+r"(acc[48]), "+r"(acc[49]), "+r"(acc[50]), "+r"(acc[51]), "+r"(acc[52]), "+r"(acc[53]), + "+r"(acc[54]), "+r"(acc[55]), "+r"(acc[56]), "+r"(acc[57]), "+r"(acc[58]), "+r"(acc[59]), + "+r"(acc[60]), "+r"(acc[61]), "+r"(acc[62]), "+r"(acc[63]) + : "l"(desc_a), "l"(desc_b), "n"(trans_a), "n"(trans_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x192x16 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Hgmma_bf16<192, TA, TB> { + static inline __device__ void mma(uint64_t desc_a, uint64_t desc_b, uint32_t (&acc)[96]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + int const trans_a = TA ? 1 : 0; + int const trans_b = TB ? 
1 : 0; + asm volatile( + "wgmma.mma_async.sync.aligned.m64n192k16.f32.bf16.bf16\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7,\n" + " %8, %9, %10, %11, %12, %13, %14, %15,\n" + " %16, %17, %18, %19, %20, %21, %22, %23,\n" + " %24, %25, %26, %27, %28, %29, %30, %31,\n" + " %32, %33, %34, %35, %36, %37, %38, %39,\n" + " %40, %41, %42, %43, %44, %45, %46, %47,\n" + " %48, %49, %50, %51, %52, %53, %54, %55,\n" + " %56, %57, %58, %59, %60, %61, %62, %63,\n" + " %64, %65, %66, %67, %68, %69, %70, %71,\n" + " %72, %73, %74, %75, %76, %77, %78, %79,\n" + " %80, %81, %82, %83, %84, %85, %86, %87,\n" + " %88, %89, %90, %91, %92, %93, %94, %95 \n" + "},\n" + " %96, %97, 1, 1, 1, %98, %99;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]), "+r"(acc[8]), "+r"(acc[9]), "+r"(acc[10]), "+r"(acc[11]), + "+r"(acc[12]), "+r"(acc[13]), "+r"(acc[14]), "+r"(acc[15]), "+r"(acc[16]), "+r"(acc[17]), + "+r"(acc[18]), "+r"(acc[19]), "+r"(acc[20]), "+r"(acc[21]), "+r"(acc[22]), "+r"(acc[23]), + "+r"(acc[24]), "+r"(acc[25]), "+r"(acc[26]), "+r"(acc[27]), "+r"(acc[28]), "+r"(acc[29]), + "+r"(acc[30]), "+r"(acc[31]), "+r"(acc[32]), "+r"(acc[33]), "+r"(acc[34]), "+r"(acc[35]), + "+r"(acc[36]), "+r"(acc[37]), "+r"(acc[38]), "+r"(acc[39]), "+r"(acc[40]), "+r"(acc[41]), + "+r"(acc[42]), "+r"(acc[43]), "+r"(acc[44]), "+r"(acc[45]), "+r"(acc[46]), "+r"(acc[47]), + "+r"(acc[48]), "+r"(acc[49]), "+r"(acc[50]), "+r"(acc[51]), "+r"(acc[52]), "+r"(acc[53]), + "+r"(acc[54]), "+r"(acc[55]), "+r"(acc[56]), "+r"(acc[57]), "+r"(acc[58]), "+r"(acc[59]), + "+r"(acc[60]), "+r"(acc[61]), "+r"(acc[62]), "+r"(acc[63]), "+r"(acc[64]), "+r"(acc[65]), + "+r"(acc[66]), "+r"(acc[67]), "+r"(acc[68]), "+r"(acc[69]), "+r"(acc[70]), "+r"(acc[71]), + "+r"(acc[72]), "+r"(acc[73]), "+r"(acc[74]), "+r"(acc[75]), "+r"(acc[76]), "+r"(acc[77]), + "+r"(acc[78]), "+r"(acc[79]), "+r"(acc[80]), "+r"(acc[81]), "+r"(acc[82]), "+r"(acc[83]), + "+r"(acc[84]), "+r"(acc[85]), "+r"(acc[86]), "+r"(acc[87]), "+r"(acc[88]), "+r"(acc[89]), + "+r"(acc[90]), "+r"(acc[91]), "+r"(acc[92]), "+r"(acc[93]), "+r"(acc[94]), "+r"(acc[95]) + : "l"(desc_a), "l"(desc_b), "n"(trans_a), "n"(trans_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x256x16 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Hgmma_bf16<256, TA, TB> { + static inline __device__ void mma(uint64_t desc_a, uint64_t desc_b, uint32_t (&acc)[128]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + int const trans_a = TA ? 1 : 0; + int const trans_b = TB ? 
1 : 0; + asm volatile( + "wgmma.mma_async.sync.aligned.m64n256k16.f32.bf16.bf16\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7,\n" + " %8, %9, %10, %11, %12, %13, %14, %15,\n" + " %16, %17, %18, %19, %20, %21, %22, %23,\n" + " %24, %25, %26, %27, %28, %29, %30, %31,\n" + " %32, %33, %34, %35, %36, %37, %38, %39,\n" + " %40, %41, %42, %43, %44, %45, %46, %47,\n" + " %48, %49, %50, %51, %52, %53, %54, %55,\n" + " %56, %57, %58, %59, %60, %61, %62, %63,\n" + " %64, %65, %66, %67, %68, %69, %70, %71,\n" + " %72, %73, %74, %75, %76, %77, %78, %79,\n" + " %80, %81, %82, %83, %84, %85, %86, %87,\n" + " %88, %89, %90, %91, %92, %93, %94, %95,\n" + " %96, %97, %98, %99, %100, %101, %102, %103,\n" + " %104, %105, %106, %107, %108, %109, %110, %111,\n" + " %112, %113, %114, %115, %116, %117, %118, %119,\n" + " %120, %121, %122, %123, %124, %125, %126, %127 \n" + "},\n" + " %128, %129, 1, 1, 1, %130, %131;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]), "+r"(acc[8]), "+r"(acc[9]), "+r"(acc[10]), "+r"(acc[11]), + "+r"(acc[12]), "+r"(acc[13]), "+r"(acc[14]), "+r"(acc[15]), "+r"(acc[16]), "+r"(acc[17]), + "+r"(acc[18]), "+r"(acc[19]), "+r"(acc[20]), "+r"(acc[21]), "+r"(acc[22]), "+r"(acc[23]), + "+r"(acc[24]), "+r"(acc[25]), "+r"(acc[26]), "+r"(acc[27]), "+r"(acc[28]), "+r"(acc[29]), + "+r"(acc[30]), "+r"(acc[31]), "+r"(acc[32]), "+r"(acc[33]), "+r"(acc[34]), "+r"(acc[35]), + "+r"(acc[36]), "+r"(acc[37]), "+r"(acc[38]), "+r"(acc[39]), "+r"(acc[40]), "+r"(acc[41]), + "+r"(acc[42]), "+r"(acc[43]), "+r"(acc[44]), "+r"(acc[45]), "+r"(acc[46]), "+r"(acc[47]), + "+r"(acc[48]), "+r"(acc[49]), "+r"(acc[50]), "+r"(acc[51]), "+r"(acc[52]), "+r"(acc[53]), + "+r"(acc[54]), "+r"(acc[55]), "+r"(acc[56]), "+r"(acc[57]), "+r"(acc[58]), "+r"(acc[59]), + "+r"(acc[60]), "+r"(acc[61]), "+r"(acc[62]), "+r"(acc[63]), "+r"(acc[64]), "+r"(acc[65]), + "+r"(acc[66]), "+r"(acc[67]), "+r"(acc[68]), "+r"(acc[69]), "+r"(acc[70]), "+r"(acc[71]), + "+r"(acc[72]), "+r"(acc[73]), "+r"(acc[74]), "+r"(acc[75]), "+r"(acc[76]), "+r"(acc[77]), + "+r"(acc[78]), "+r"(acc[79]), "+r"(acc[80]), "+r"(acc[81]), "+r"(acc[82]), "+r"(acc[83]), + "+r"(acc[84]), "+r"(acc[85]), "+r"(acc[86]), "+r"(acc[87]), "+r"(acc[88]), "+r"(acc[89]), + "+r"(acc[90]), "+r"(acc[91]), "+r"(acc[92]), "+r"(acc[93]), "+r"(acc[94]), "+r"(acc[95]), + "+r"(acc[96]), "+r"(acc[97]), "+r"(acc[98]), "+r"(acc[99]), "+r"(acc[100]), + "+r"(acc[101]), "+r"(acc[102]), "+r"(acc[103]), "+r"(acc[104]), "+r"(acc[105]), + "+r"(acc[106]), "+r"(acc[107]), "+r"(acc[108]), "+r"(acc[109]), "+r"(acc[110]), + "+r"(acc[111]), "+r"(acc[112]), "+r"(acc[113]), "+r"(acc[114]), "+r"(acc[115]), + "+r"(acc[116]), "+r"(acc[117]), "+r"(acc[118]), "+r"(acc[119]), "+r"(acc[120]), + "+r"(acc[121]), "+r"(acc[122]), "+r"(acc[123]), "+r"(acc[124]), "+r"(acc[125]), + "+r"(acc[126]), "+r"(acc[127]) + : "l"(desc_a), "l"(desc_b), "n"(trans_a), "n"(trans_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ void hgmma_bf16(uint64_t desc_a, uint64_t desc_b, uint32_t (&acc)[N / 2]) { + Hgmma_bf16::mma(desc_a, desc_b, acc); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// BF16 GMMAs with FP32 Accumulator, where A is coming from RF +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Hgmma_rfa_bf16 {}; + 
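For the register-sourced-A ("rfa") variants declared below, the A operand is the per-thread fragment of a 64x16 tile of 16-bit values: 1024 elements split across the 128 threads of a warpgroup gives 8 values per thread, i.e. four packed 32-bit registers, which is why every rfa entry point takes uint32_t const (&a)[4]. The sketch below shows only that 8-values-to-4-registers packing arithmetic; the actual element-to-register order is dictated by the wgmma fragment layout and is not reproduced here, and the helper name is an assumption.

// Illustrative sketch only: packing one thread's eight bf16 A-values into the four
// 32-bit registers the rfa wrappers expect. Assumes <cuda_bf16.h> is available for
// __nv_bfloat162 / __floats2bfloat162_rn; the swizzle/order is intentionally omitted.
#include <cuda_bf16.h>

__device__ inline void example_pack_a_fragment_bf16(float const (&vals)[8],
                                                    uint32_t (&a)[4]) {
#pragma unroll
  for (int i = 0; i < 4; ++i) {
    // Two bf16 values per 32-bit register.
    __nv_bfloat162 pair = __floats2bfloat162_rn(vals[2 * i], vals[2 * i + 1]);
    a[i] = reinterpret_cast<uint32_t const&>(pair);
  }
}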
+//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x8x16 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Hgmma_rfa_bf16<8, TB> { + static inline __device__ void mma(uint32_t const (&a)[4], uint64_t desc_b, uint32_t (&acc)[4]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + int const trans_b = TB ? 1 : 0; + asm volatile( + "wgmma.mma_async.sync.aligned.m64n8k16.f32.bf16.bf16\n" + "{\n" + " %0, %1, %2, %3\n" + "}\n," + "{ %4, %5, %6, %7 }, %8, 1, 1, 1, %9;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]) + : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "l"(desc_b), "n"(trans_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x32x16 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Hgmma_rfa_bf16<32, TB> { + static inline __device__ void mma(uint32_t const (&a)[4], uint64_t desc_b, uint32_t (&acc)[16]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + int const trans_b = TB ? 1 : 0; + asm volatile( + "wgmma.mma_async.sync.aligned.m64n32k16.f32.bf16.bf16\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7,\n" + " %8, %9, %10, %11, %12, %13, %14, %15 \n" + "},\n" + "{ %16, %17, %18, %19 }, %20, 1, 1, 1, %21;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]), "+r"(acc[8]), "+r"(acc[9]), "+r"(acc[10]), "+r"(acc[11]), + "+r"(acc[12]), "+r"(acc[13]), "+r"(acc[14]), "+r"(acc[15]) + : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "l"(desc_b), "n"(trans_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x64x16 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Hgmma_rfa_bf16<64, TB> { + static inline __device__ void mma(uint32_t const (&a)[4], uint64_t desc_b, uint32_t (&acc)[32]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + int const trans_b = TB ? 
1 : 0; + asm volatile( + "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7,\n" + " %8, %9, %10, %11, %12, %13, %14, %15,\n" + " %16, %17, %18, %19, %20, %21, %22, %23,\n" + " %24, %25, %26, %27, %28, %29, %30, %31 \n" + "},\n" + "{ %32, %33, %34, %35 }, %36, 1, 1, 1, %37;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]), "+r"(acc[8]), "+r"(acc[9]), "+r"(acc[10]), "+r"(acc[11]), + "+r"(acc[12]), "+r"(acc[13]), "+r"(acc[14]), "+r"(acc[15]), "+r"(acc[16]), "+r"(acc[17]), + "+r"(acc[18]), "+r"(acc[19]), "+r"(acc[20]), "+r"(acc[21]), "+r"(acc[22]), "+r"(acc[23]), + "+r"(acc[24]), "+r"(acc[25]), "+r"(acc[26]), "+r"(acc[27]), "+r"(acc[28]), "+r"(acc[29]), + "+r"(acc[30]), "+r"(acc[31]) + : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "l"(desc_b), "n"(trans_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x128x16 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Hgmma_rfa_bf16<128, TB> { + static inline __device__ void mma(uint32_t const (&a)[4], uint64_t desc_b, uint32_t (&acc)[64]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + int const trans_b = TB ? 1 : 0; + asm volatile( + "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7,\n" + " %8, %9, %10, %11, %12, %13, %14, %15,\n" + " %16, %17, %18, %19, %20, %21, %22, %23,\n" + " %24, %25, %26, %27, %28, %29, %30, %31,\n" + " %32, %33, %34, %35, %36, %37, %38, %39,\n" + " %40, %41, %42, %43, %44, %45, %46, %47,\n" + " %48, %49, %50, %51, %52, %53, %54, %55,\n" + " %56, %57, %58, %59, %60, %61, %62, %63 \n" + "},\n" + "{ %64, %65, %66, %67 }, %68, 1, 1, 1, %69;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]), "+r"(acc[8]), "+r"(acc[9]), "+r"(acc[10]), "+r"(acc[11]), + "+r"(acc[12]), "+r"(acc[13]), "+r"(acc[14]), "+r"(acc[15]), "+r"(acc[16]), "+r"(acc[17]), + "+r"(acc[18]), "+r"(acc[19]), "+r"(acc[20]), "+r"(acc[21]), "+r"(acc[22]), "+r"(acc[23]), + "+r"(acc[24]), "+r"(acc[25]), "+r"(acc[26]), "+r"(acc[27]), "+r"(acc[28]), "+r"(acc[29]), + "+r"(acc[30]), "+r"(acc[31]), "+r"(acc[32]), "+r"(acc[33]), "+r"(acc[34]), "+r"(acc[35]), + "+r"(acc[36]), "+r"(acc[37]), "+r"(acc[38]), "+r"(acc[39]), "+r"(acc[40]), "+r"(acc[41]), + "+r"(acc[42]), "+r"(acc[43]), "+r"(acc[44]), "+r"(acc[45]), "+r"(acc[46]), "+r"(acc[47]), + "+r"(acc[48]), "+r"(acc[49]), "+r"(acc[50]), "+r"(acc[51]), "+r"(acc[52]), "+r"(acc[53]), + "+r"(acc[54]), "+r"(acc[55]), "+r"(acc[56]), "+r"(acc[57]), "+r"(acc[58]), "+r"(acc[59]), + "+r"(acc[60]), "+r"(acc[61]), "+r"(acc[62]), "+r"(acc[63]) + : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "l"(desc_b), "n"(trans_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x192x16 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Hgmma_rfa_bf16<192, TB> { + static inline __device__ void mma(uint32_t const (&a)[4], uint64_t desc_b, uint32_t (&acc)[96]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + int const trans_b = TB ? 
1 : 0; + asm volatile( + "wgmma.mma_async.sync.aligned.m64n192k16.f32.bf16.bf16\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7,\n" + " %8, %9, %10, %11, %12, %13, %14, %15,\n" + " %16, %17, %18, %19, %20, %21, %22, %23,\n" + " %24, %25, %26, %27, %28, %29, %30, %31,\n" + " %32, %33, %34, %35, %36, %37, %38, %39,\n" + " %40, %41, %42, %43, %44, %45, %46, %47,\n" + " %48, %49, %50, %51, %52, %53, %54, %55,\n" + " %56, %57, %58, %59, %60, %61, %62, %63,\n" + " %64, %65, %66, %67, %68, %69, %70, %71,\n" + " %72, %73, %74, %75, %76, %77, %78, %79,\n" + " %80, %81, %82, %83, %84, %85, %86, %87,\n" + " %88, %89, %90, %91, %92, %93, %94, %95 \n" + "},\n" + "{ %96, %97, %98, %99 }, %100, 1, 1, 1, %101;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]), "+r"(acc[8]), "+r"(acc[9]), "+r"(acc[10]), "+r"(acc[11]), + "+r"(acc[12]), "+r"(acc[13]), "+r"(acc[14]), "+r"(acc[15]), "+r"(acc[16]), "+r"(acc[17]), + "+r"(acc[18]), "+r"(acc[19]), "+r"(acc[20]), "+r"(acc[21]), "+r"(acc[22]), "+r"(acc[23]), + "+r"(acc[24]), "+r"(acc[25]), "+r"(acc[26]), "+r"(acc[27]), "+r"(acc[28]), "+r"(acc[29]), + "+r"(acc[30]), "+r"(acc[31]), "+r"(acc[32]), "+r"(acc[33]), "+r"(acc[34]), "+r"(acc[35]), + "+r"(acc[36]), "+r"(acc[37]), "+r"(acc[38]), "+r"(acc[39]), "+r"(acc[40]), "+r"(acc[41]), + "+r"(acc[42]), "+r"(acc[43]), "+r"(acc[44]), "+r"(acc[45]), "+r"(acc[46]), "+r"(acc[47]), + "+r"(acc[48]), "+r"(acc[49]), "+r"(acc[50]), "+r"(acc[51]), "+r"(acc[52]), "+r"(acc[53]), + "+r"(acc[54]), "+r"(acc[55]), "+r"(acc[56]), "+r"(acc[57]), "+r"(acc[58]), "+r"(acc[59]), + "+r"(acc[60]), "+r"(acc[61]), "+r"(acc[62]), "+r"(acc[63]), "+r"(acc[64]), "+r"(acc[65]), + "+r"(acc[66]), "+r"(acc[67]), "+r"(acc[68]), "+r"(acc[69]), "+r"(acc[70]), "+r"(acc[71]), + "+r"(acc[72]), "+r"(acc[73]), "+r"(acc[74]), "+r"(acc[75]), "+r"(acc[76]), "+r"(acc[77]), + "+r"(acc[78]), "+r"(acc[79]), "+r"(acc[80]), "+r"(acc[81]), "+r"(acc[82]), "+r"(acc[83]), + "+r"(acc[84]), "+r"(acc[85]), "+r"(acc[86]), "+r"(acc[87]), "+r"(acc[88]), "+r"(acc[89]), + "+r"(acc[90]), "+r"(acc[91]), "+r"(acc[92]), "+r"(acc[93]), "+r"(acc[94]), "+r"(acc[95]) + : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "l"(desc_b), "n"(trans_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x256x16 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Hgmma_rfa_bf16<256, TB> { + static inline __device__ void mma(uint32_t const (&a)[4], uint64_t desc_b, uint32_t (&acc)[128]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + int const trans_b = TB ? 
1 : 0; + asm volatile( + "wgmma.mma_async.sync.aligned.m64n256k16.f32.bf16.bf16\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7,\n" + " %8, %9, %10, %11, %12, %13, %14, %15,\n" + " %16, %17, %18, %19, %20, %21, %22, %23,\n" + " %24, %25, %26, %27, %28, %29, %30, %31,\n" + " %32, %33, %34, %35, %36, %37, %38, %39,\n" + " %40, %41, %42, %43, %44, %45, %46, %47,\n" + " %48, %49, %50, %51, %52, %53, %54, %55,\n" + " %56, %57, %58, %59, %60, %61, %62, %63,\n" + " %64, %65, %66, %67, %68, %69, %70, %71,\n" + " %72, %73, %74, %75, %76, %77, %78, %79,\n" + " %80, %81, %82, %83, %84, %85, %86, %87,\n" + " %88, %89, %90, %91, %92, %93, %94, %95,\n" + " %96, %97, %98, %99, %100, %101, %102, %103,\n" + " %104, %105, %106, %107, %108, %109, %110, %111,\n" + " %112, %113, %114, %115, %116, %117, %118, %119,\n" + " %120, %121, %122, %123, %124, %125, %126, %127 \n" + "},\n" + "{ %128, %129, %130, %131 }, %132, 1, 1, 1, %133;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]), "+r"(acc[8]), "+r"(acc[9]), "+r"(acc[10]), "+r"(acc[11]), + "+r"(acc[12]), "+r"(acc[13]), "+r"(acc[14]), "+r"(acc[15]), "+r"(acc[16]), "+r"(acc[17]), + "+r"(acc[18]), "+r"(acc[19]), "+r"(acc[20]), "+r"(acc[21]), "+r"(acc[22]), "+r"(acc[23]), + "+r"(acc[24]), "+r"(acc[25]), "+r"(acc[26]), "+r"(acc[27]), "+r"(acc[28]), "+r"(acc[29]), + "+r"(acc[30]), "+r"(acc[31]), "+r"(acc[32]), "+r"(acc[33]), "+r"(acc[34]), "+r"(acc[35]), + "+r"(acc[36]), "+r"(acc[37]), "+r"(acc[38]), "+r"(acc[39]), "+r"(acc[40]), "+r"(acc[41]), + "+r"(acc[42]), "+r"(acc[43]), "+r"(acc[44]), "+r"(acc[45]), "+r"(acc[46]), "+r"(acc[47]), + "+r"(acc[48]), "+r"(acc[49]), "+r"(acc[50]), "+r"(acc[51]), "+r"(acc[52]), "+r"(acc[53]), + "+r"(acc[54]), "+r"(acc[55]), "+r"(acc[56]), "+r"(acc[57]), "+r"(acc[58]), "+r"(acc[59]), + "+r"(acc[60]), "+r"(acc[61]), "+r"(acc[62]), "+r"(acc[63]), "+r"(acc[64]), "+r"(acc[65]), + "+r"(acc[66]), "+r"(acc[67]), "+r"(acc[68]), "+r"(acc[69]), "+r"(acc[70]), "+r"(acc[71]), + "+r"(acc[72]), "+r"(acc[73]), "+r"(acc[74]), "+r"(acc[75]), "+r"(acc[76]), "+r"(acc[77]), + "+r"(acc[78]), "+r"(acc[79]), "+r"(acc[80]), "+r"(acc[81]), "+r"(acc[82]), "+r"(acc[83]), + "+r"(acc[84]), "+r"(acc[85]), "+r"(acc[86]), "+r"(acc[87]), "+r"(acc[88]), "+r"(acc[89]), + "+r"(acc[90]), "+r"(acc[91]), "+r"(acc[92]), "+r"(acc[93]), "+r"(acc[94]), "+r"(acc[95]), + "+r"(acc[96]), "+r"(acc[97]), "+r"(acc[98]), "+r"(acc[99]), "+r"(acc[100]), + "+r"(acc[101]), "+r"(acc[102]), "+r"(acc[103]), "+r"(acc[104]), "+r"(acc[105]), + "+r"(acc[106]), "+r"(acc[107]), "+r"(acc[108]), "+r"(acc[109]), "+r"(acc[110]), + "+r"(acc[111]), "+r"(acc[112]), "+r"(acc[113]), "+r"(acc[114]), "+r"(acc[115]), + "+r"(acc[116]), "+r"(acc[117]), "+r"(acc[118]), "+r"(acc[119]), "+r"(acc[120]), + "+r"(acc[121]), "+r"(acc[122]), "+r"(acc[123]), "+r"(acc[124]), "+r"(acc[125]), + "+r"(acc[126]), "+r"(acc[127]) + : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "l"(desc_b), "n"(trans_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ void hgmma_rfa_bf16(uint32_t const (&a)[4], uint64_t desc_b, + uint32_t (&acc)[N / 2]) { + Hgmma_rfa_bf16::mma(a, desc_b, acc); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace fmha diff --git a/csrc/fmha_v2/fmha/hopper/utils_igmma.h b/csrc/fmha_v2/fmha/hopper/utils_igmma.h new file mode 100644 index 0000000000..fcced80616 --- /dev/null 
+++ b/csrc/fmha_v2/fmha/hopper/utils_igmma.h @@ -0,0 +1,396 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2011-2024 NVIDIA CORPORATION & AFFILIATES. All rights + * reserved. SPDX-License-Identifier: NVIDIA TensorRT Source Code License Agreement + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. + */ + +#pragma once + +namespace fmha { + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// IGMMA 64xNx32 TN with int32 Accumulator with A and B from SMEM +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Igmma_int8_int32 {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x64x32 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Igmma_int8_int32<64> { + static inline __device__ void mma(uint64_t desc_a, uint64_t desc_b, uint32_t (&acc)[32]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n64k32.s32.s8.s8\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7,\n" + " %8, %9, %10, %11, %12, %13, %14, %15,\n" + " %16, %17, %18, %19, %20, %21, %22, %23,\n" + " %24, %25, %26, %27, %28, %29, %30, %31 \n" + "},\n" + " %32, %33, 1;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]), "+r"(acc[8]), "+r"(acc[9]), "+r"(acc[10]), "+r"(acc[11]), + "+r"(acc[12]), "+r"(acc[13]), "+r"(acc[14]), "+r"(acc[15]), "+r"(acc[16]), "+r"(acc[17]), + "+r"(acc[18]), "+r"(acc[19]), "+r"(acc[20]), "+r"(acc[21]), "+r"(acc[22]), "+r"(acc[23]), + "+r"(acc[24]), "+r"(acc[25]), "+r"(acc[26]), "+r"(acc[27]), "+r"(acc[28]), "+r"(acc[29]), + "+r"(acc[30]), "+r"(acc[31]) + : "l"(desc_a), "l"(desc_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x128x32 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Igmma_int8_int32<128> { + static inline __device__ void mma(uint64_t desc_a, uint64_t desc_b, uint32_t (&acc)[64]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n128k32.s32.s8.s8\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7,\n" + " %8, %9, %10, %11, %12, %13, %14, %15,\n" + " %16, %17, %18, %19, %20, %21, %22, %23,\n" + " %24, %25, %26, %27, %28, %29, %30, %31,\n" + " %32, %33, %34, %35, %36, %37, %38, %39,\n" + " %40, %41, %42, %43, %44, %45, %46, %47,\n" + " %48, %49, %50, %51, %52, %53, %54, %55,\n" + " %56, %57, %58, %59, %60, %61, %62, %63 \n" + "},\n" + " %64, %65, 1;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]), "+r"(acc[8]), "+r"(acc[9]), "+r"(acc[10]), "+r"(acc[11]), + "+r"(acc[12]), "+r"(acc[13]), "+r"(acc[14]), "+r"(acc[15]), "+r"(acc[16]), "+r"(acc[17]), + "+r"(acc[18]), "+r"(acc[19]), "+r"(acc[20]), "+r"(acc[21]), 
"+r"(acc[22]), "+r"(acc[23]), + "+r"(acc[24]), "+r"(acc[25]), "+r"(acc[26]), "+r"(acc[27]), "+r"(acc[28]), "+r"(acc[29]), + "+r"(acc[30]), "+r"(acc[31]), "+r"(acc[32]), "+r"(acc[33]), "+r"(acc[34]), "+r"(acc[35]), + "+r"(acc[36]), "+r"(acc[37]), "+r"(acc[38]), "+r"(acc[39]), "+r"(acc[40]), "+r"(acc[41]), + "+r"(acc[42]), "+r"(acc[43]), "+r"(acc[44]), "+r"(acc[45]), "+r"(acc[46]), "+r"(acc[47]), + "+r"(acc[48]), "+r"(acc[49]), "+r"(acc[50]), "+r"(acc[51]), "+r"(acc[52]), "+r"(acc[53]), + "+r"(acc[54]), "+r"(acc[55]), "+r"(acc[56]), "+r"(acc[57]), "+r"(acc[58]), "+r"(acc[59]), + "+r"(acc[60]), "+r"(acc[61]), "+r"(acc[62]), "+r"(acc[63]) + : "l"(desc_a), "l"(desc_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x192x32 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Igmma_int8_int32<192> { + static inline __device__ void mma(uint64_t desc_a, uint64_t desc_b, uint32_t (&acc)[96]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n192k32.s32.s8.s8\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7,\n" + " %8, %9, %10, %11, %12, %13, %14, %15,\n" + " %16, %17, %18, %19, %20, %21, %22, %23,\n" + " %24, %25, %26, %27, %28, %29, %30, %31,\n" + " %32, %33, %34, %35, %36, %37, %38, %39,\n" + " %40, %41, %42, %43, %44, %45, %46, %47,\n" + " %48, %49, %50, %51, %52, %53, %54, %55,\n" + " %56, %57, %58, %59, %60, %61, %62, %63,\n" + " %64, %65, %66, %67, %68, %69, %70, %71,\n" + " %72, %73, %74, %75, %76, %77, %78, %79,\n" + " %80, %81, %82, %83, %84, %85, %86, %87,\n" + " %88, %89, %90, %91, %92, %93, %94, %95 \n" + "},\n" + " %96, %97, 1;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]), "+r"(acc[8]), "+r"(acc[9]), "+r"(acc[10]), "+r"(acc[11]), + "+r"(acc[12]), "+r"(acc[13]), "+r"(acc[14]), "+r"(acc[15]), "+r"(acc[16]), "+r"(acc[17]), + "+r"(acc[18]), "+r"(acc[19]), "+r"(acc[20]), "+r"(acc[21]), "+r"(acc[22]), "+r"(acc[23]), + "+r"(acc[24]), "+r"(acc[25]), "+r"(acc[26]), "+r"(acc[27]), "+r"(acc[28]), "+r"(acc[29]), + "+r"(acc[30]), "+r"(acc[31]), "+r"(acc[32]), "+r"(acc[33]), "+r"(acc[34]), "+r"(acc[35]), + "+r"(acc[36]), "+r"(acc[37]), "+r"(acc[38]), "+r"(acc[39]), "+r"(acc[40]), "+r"(acc[41]), + "+r"(acc[42]), "+r"(acc[43]), "+r"(acc[44]), "+r"(acc[45]), "+r"(acc[46]), "+r"(acc[47]), + "+r"(acc[48]), "+r"(acc[49]), "+r"(acc[50]), "+r"(acc[51]), "+r"(acc[52]), "+r"(acc[53]), + "+r"(acc[54]), "+r"(acc[55]), "+r"(acc[56]), "+r"(acc[57]), "+r"(acc[58]), "+r"(acc[59]), + "+r"(acc[60]), "+r"(acc[61]), "+r"(acc[62]), "+r"(acc[63]), "+r"(acc[64]), "+r"(acc[65]), + "+r"(acc[66]), "+r"(acc[67]), "+r"(acc[68]), "+r"(acc[69]), "+r"(acc[70]), "+r"(acc[71]), + "+r"(acc[72]), "+r"(acc[73]), "+r"(acc[74]), "+r"(acc[75]), "+r"(acc[76]), "+r"(acc[77]), + "+r"(acc[78]), "+r"(acc[79]), "+r"(acc[80]), "+r"(acc[81]), "+r"(acc[82]), "+r"(acc[83]), + "+r"(acc[84]), "+r"(acc[85]), "+r"(acc[86]), "+r"(acc[87]), "+r"(acc[88]), "+r"(acc[89]), + "+r"(acc[90]), "+r"(acc[91]), "+r"(acc[92]), "+r"(acc[93]), "+r"(acc[94]), "+r"(acc[95]) + : "l"(desc_a), "l"(desc_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x256x32 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> 
+struct Igmma_int8_int32<256> { + static inline __device__ void mma(uint64_t desc_a, uint64_t desc_b, uint32_t (&acc)[128]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n256k32.s32.s8.s8\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7,\n" + " %8, %9, %10, %11, %12, %13, %14, %15,\n" + " %16, %17, %18, %19, %20, %21, %22, %23,\n" + " %24, %25, %26, %27, %28, %29, %30, %31,\n" + " %32, %33, %34, %35, %36, %37, %38, %39,\n" + " %40, %41, %42, %43, %44, %45, %46, %47,\n" + " %48, %49, %50, %51, %52, %53, %54, %55,\n" + " %56, %57, %58, %59, %60, %61, %62, %63,\n" + " %64, %65, %66, %67, %68, %69, %70, %71,\n" + " %72, %73, %74, %75, %76, %77, %78, %79,\n" + " %80, %81, %82, %83, %84, %85, %86, %87,\n" + " %88, %89, %90, %91, %92, %93, %94, %95,\n" + " %96, %97, %98, %99, %100, %101, %102, %103,\n" + " %104, %105, %106, %107, %108, %109, %110, %111,\n" + " %112, %113, %114, %115, %116, %117, %118, %119,\n" + " %120, %121, %122, %123, %124, %125, %126, %127 \n" + "},\n" + " %128, %129, 1;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]), "+r"(acc[8]), "+r"(acc[9]), "+r"(acc[10]), "+r"(acc[11]), + "+r"(acc[12]), "+r"(acc[13]), "+r"(acc[14]), "+r"(acc[15]), "+r"(acc[16]), "+r"(acc[17]), + "+r"(acc[18]), "+r"(acc[19]), "+r"(acc[20]), "+r"(acc[21]), "+r"(acc[22]), "+r"(acc[23]), + "+r"(acc[24]), "+r"(acc[25]), "+r"(acc[26]), "+r"(acc[27]), "+r"(acc[28]), "+r"(acc[29]), + "+r"(acc[30]), "+r"(acc[31]), "+r"(acc[32]), "+r"(acc[33]), "+r"(acc[34]), "+r"(acc[35]), + "+r"(acc[36]), "+r"(acc[37]), "+r"(acc[38]), "+r"(acc[39]), "+r"(acc[40]), "+r"(acc[41]), + "+r"(acc[42]), "+r"(acc[43]), "+r"(acc[44]), "+r"(acc[45]), "+r"(acc[46]), "+r"(acc[47]), + "+r"(acc[48]), "+r"(acc[49]), "+r"(acc[50]), "+r"(acc[51]), "+r"(acc[52]), "+r"(acc[53]), + "+r"(acc[54]), "+r"(acc[55]), "+r"(acc[56]), "+r"(acc[57]), "+r"(acc[58]), "+r"(acc[59]), + "+r"(acc[60]), "+r"(acc[61]), "+r"(acc[62]), "+r"(acc[63]), "+r"(acc[64]), "+r"(acc[65]), + "+r"(acc[66]), "+r"(acc[67]), "+r"(acc[68]), "+r"(acc[69]), "+r"(acc[70]), "+r"(acc[71]), + "+r"(acc[72]), "+r"(acc[73]), "+r"(acc[74]), "+r"(acc[75]), "+r"(acc[76]), "+r"(acc[77]), + "+r"(acc[78]), "+r"(acc[79]), "+r"(acc[80]), "+r"(acc[81]), "+r"(acc[82]), "+r"(acc[83]), + "+r"(acc[84]), "+r"(acc[85]), "+r"(acc[86]), "+r"(acc[87]), "+r"(acc[88]), "+r"(acc[89]), + "+r"(acc[90]), "+r"(acc[91]), "+r"(acc[92]), "+r"(acc[93]), "+r"(acc[94]), "+r"(acc[95]), + "+r"(acc[96]), "+r"(acc[97]), "+r"(acc[98]), "+r"(acc[99]), "+r"(acc[100]), + "+r"(acc[101]), "+r"(acc[102]), "+r"(acc[103]), "+r"(acc[104]), "+r"(acc[105]), + "+r"(acc[106]), "+r"(acc[107]), "+r"(acc[108]), "+r"(acc[109]), "+r"(acc[110]), + "+r"(acc[111]), "+r"(acc[112]), "+r"(acc[113]), "+r"(acc[114]), "+r"(acc[115]), + "+r"(acc[116]), "+r"(acc[117]), "+r"(acc[118]), "+r"(acc[119]), "+r"(acc[120]), + "+r"(acc[121]), "+r"(acc[122]), "+r"(acc[123]), "+r"(acc[124]), "+r"(acc[125]), + "+r"(acc[126]), "+r"(acc[127]) + : "l"(desc_a), "l"(desc_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ void igmma_int8_int32(uint64_t desc_a, uint64_t desc_b, uint32_t (&acc)[N / 2]) { + Igmma_int8_int32::mma(desc_a, desc_b, acc); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// IGMMA 64xNx32 TN with int32 Accumulator with 
A from RF and B from SMEM. +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Igmma_rfa_int8_int32 {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x64x32 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Igmma_rfa_int8_int32<64> { + static inline __device__ void mma(uint32_t const (&a)[4], uint64_t desc_b, uint32_t (&acc)[32]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n64k32.s32.s8.s8\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7,\n" + " %8, %9, %10, %11, %12, %13, %14, %15,\n" + " %16, %17, %18, %19, %20, %21, %22, %23,\n" + " %24, %25, %26, %27, %28, %29, %30, %31 \n" + "},\n" + "{ %32, %33, %34, %35 }, %36, 1;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]), "+r"(acc[8]), "+r"(acc[9]), "+r"(acc[10]), "+r"(acc[11]), + "+r"(acc[12]), "+r"(acc[13]), "+r"(acc[14]), "+r"(acc[15]), "+r"(acc[16]), "+r"(acc[17]), + "+r"(acc[18]), "+r"(acc[19]), "+r"(acc[20]), "+r"(acc[21]), "+r"(acc[22]), "+r"(acc[23]), + "+r"(acc[24]), "+r"(acc[25]), "+r"(acc[26]), "+r"(acc[27]), "+r"(acc[28]), "+r"(acc[29]), + "+r"(acc[30]), "+r"(acc[31]) + : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "l"(desc_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x128x32 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Igmma_rfa_int8_int32<128> { + static inline __device__ void mma(uint32_t const (&a)[4], uint64_t desc_b, uint32_t (&acc)[64]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n128k32.s32.s8.s8\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7,\n" + " %8, %9, %10, %11, %12, %13, %14, %15,\n" + " %16, %17, %18, %19, %20, %21, %22, %23,\n" + " %24, %25, %26, %27, %28, %29, %30, %31,\n" + " %32, %33, %34, %35, %36, %37, %38, %39,\n" + " %40, %41, %42, %43, %44, %45, %46, %47,\n" + " %48, %49, %50, %51, %52, %53, %54, %55,\n" + " %56, %57, %58, %59, %60, %61, %62, %63 \n" + "},\n" + "{ %64, %65, %66, %67 }, %68, 1;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]), "+r"(acc[8]), "+r"(acc[9]), "+r"(acc[10]), "+r"(acc[11]), + "+r"(acc[12]), "+r"(acc[13]), "+r"(acc[14]), "+r"(acc[15]), "+r"(acc[16]), "+r"(acc[17]), + "+r"(acc[18]), "+r"(acc[19]), "+r"(acc[20]), "+r"(acc[21]), "+r"(acc[22]), "+r"(acc[23]), + "+r"(acc[24]), "+r"(acc[25]), "+r"(acc[26]), "+r"(acc[27]), "+r"(acc[28]), "+r"(acc[29]), + "+r"(acc[30]), "+r"(acc[31]), "+r"(acc[32]), "+r"(acc[33]), "+r"(acc[34]), "+r"(acc[35]), + "+r"(acc[36]), "+r"(acc[37]), "+r"(acc[38]), "+r"(acc[39]), "+r"(acc[40]), "+r"(acc[41]), + "+r"(acc[42]), "+r"(acc[43]), "+r"(acc[44]), "+r"(acc[45]), "+r"(acc[46]), "+r"(acc[47]), + "+r"(acc[48]), "+r"(acc[49]), "+r"(acc[50]), "+r"(acc[51]), "+r"(acc[52]), "+r"(acc[53]), + "+r"(acc[54]), "+r"(acc[55]), "+r"(acc[56]), "+r"(acc[57]), "+r"(acc[58]), "+r"(acc[59]), + "+r"(acc[60]), "+r"(acc[61]), "+r"(acc[62]), "+r"(acc[63]) + : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "l"(desc_b)); +#endif + } +}; + 
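The int8 wrappers in this file differ from the fp16/bf16 ones in two ways: k is 32 instead of 16, and the s8.s8 instruction is TN-only, so there are no transpose immediates; the int32 accumulator still occupies N/2 registers per thread, one 32-bit value each. Below is a hedged sketch of issuing the SMEM x SMEM dispatcher igmma_int8_int32 defined earlier in this file and dequantizing the result; the helper name, the single scale factor, and the fence placement are assumptions, not part of this patch.

// Illustrative sketch only: one 64x128x32 s8.s8 GMMA followed by per-element
// dequantization of the int32 accumulator. desc_a/desc_b are the usual shared-memory
// descriptors; `scale` is a hypothetical dequantization factor.
__device__ inline void example_issue_igmma_s8_64x128(uint64_t desc_a, uint64_t desc_b,
                                                     uint32_t (&acc)[64], float scale,
                                                     float (&out)[64]) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL)
  asm volatile("wgmma.fence.sync.aligned;\n" ::: "memory");
  // N = 128 with an int32 accumulator: 128/2 = 64 registers per thread.
  fmha::igmma_int8_int32<128>(desc_a, desc_b, acc);
  asm volatile("wgmma.commit_group.sync.aligned;\n" ::: "memory");
  asm volatile("wgmma.wait_group.sync.aligned 0;\n" ::: "memory");
#pragma unroll
  for (int i = 0; i < 64; ++i) {
    // Accumulators are s32 bit patterns carried in uint32_t registers.
    out[i] = static_cast<float>(static_cast<int32_t>(acc[i])) * scale;
  }
#endif
}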
+//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x192x32 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Igmma_rfa_int8_int32<192> { + static inline __device__ void mma(uint32_t const (&a)[4], uint64_t desc_b, uint32_t (&acc)[96]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n192k32.s32.s8.s8\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7,\n" + " %8, %9, %10, %11, %12, %13, %14, %15,\n" + " %16, %17, %18, %19, %20, %21, %22, %23,\n" + " %24, %25, %26, %27, %28, %29, %30, %31,\n" + " %32, %33, %34, %35, %36, %37, %38, %39,\n" + " %40, %41, %42, %43, %44, %45, %46, %47,\n" + " %48, %49, %50, %51, %52, %53, %54, %55,\n" + " %56, %57, %58, %59, %60, %61, %62, %63,\n" + " %64, %65, %66, %67, %68, %69, %70, %71,\n" + " %72, %73, %74, %75, %76, %77, %78, %79,\n" + " %80, %81, %82, %83, %84, %85, %86, %87,\n" + " %88, %89, %90, %91, %92, %93, %94, %95 \n" + "},\n" + "{ %96, %97, %98, %99 }, %100, 1;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]), "+r"(acc[8]), "+r"(acc[9]), "+r"(acc[10]), "+r"(acc[11]), + "+r"(acc[12]), "+r"(acc[13]), "+r"(acc[14]), "+r"(acc[15]), "+r"(acc[16]), "+r"(acc[17]), + "+r"(acc[18]), "+r"(acc[19]), "+r"(acc[20]), "+r"(acc[21]), "+r"(acc[22]), "+r"(acc[23]), + "+r"(acc[24]), "+r"(acc[25]), "+r"(acc[26]), "+r"(acc[27]), "+r"(acc[28]), "+r"(acc[29]), + "+r"(acc[30]), "+r"(acc[31]), "+r"(acc[32]), "+r"(acc[33]), "+r"(acc[34]), "+r"(acc[35]), + "+r"(acc[36]), "+r"(acc[37]), "+r"(acc[38]), "+r"(acc[39]), "+r"(acc[40]), "+r"(acc[41]), + "+r"(acc[42]), "+r"(acc[43]), "+r"(acc[44]), "+r"(acc[45]), "+r"(acc[46]), "+r"(acc[47]), + "+r"(acc[48]), "+r"(acc[49]), "+r"(acc[50]), "+r"(acc[51]), "+r"(acc[52]), "+r"(acc[53]), + "+r"(acc[54]), "+r"(acc[55]), "+r"(acc[56]), "+r"(acc[57]), "+r"(acc[58]), "+r"(acc[59]), + "+r"(acc[60]), "+r"(acc[61]), "+r"(acc[62]), "+r"(acc[63]), "+r"(acc[64]), "+r"(acc[65]), + "+r"(acc[66]), "+r"(acc[67]), "+r"(acc[68]), "+r"(acc[69]), "+r"(acc[70]), "+r"(acc[71]), + "+r"(acc[72]), "+r"(acc[73]), "+r"(acc[74]), "+r"(acc[75]), "+r"(acc[76]), "+r"(acc[77]), + "+r"(acc[78]), "+r"(acc[79]), "+r"(acc[80]), "+r"(acc[81]), "+r"(acc[82]), "+r"(acc[83]), + "+r"(acc[84]), "+r"(acc[85]), "+r"(acc[86]), "+r"(acc[87]), "+r"(acc[88]), "+r"(acc[89]), + "+r"(acc[90]), "+r"(acc[91]), "+r"(acc[92]), "+r"(acc[93]), "+r"(acc[94]), "+r"(acc[95]) + : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "l"(desc_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x256x32 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Igmma_rfa_int8_int32<256> { + static inline __device__ void mma(uint32_t const (&a)[4], uint64_t desc_b, uint32_t (&acc)[128]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n256k32.s32.s8.s8\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7,\n" + " %8, %9, %10, %11, %12, %13, %14, %15,\n" + " %16, %17, %18, %19, %20, %21, %22, %23,\n" + " %24, %25, %26, %27, %28, %29, %30, %31,\n" + " %32, %33, %34, %35, %36, %37, %38, %39,\n" + " %40, %41, %42, %43, %44, %45, %46, %47,\n" + " %48, %49, %50, %51, %52, %53, %54, %55,\n" + " %56, 
%57, %58, %59, %60, %61, %62, %63,\n" + " %64, %65, %66, %67, %68, %69, %70, %71,\n" + " %72, %73, %74, %75, %76, %77, %78, %79,\n" + " %80, %81, %82, %83, %84, %85, %86, %87,\n" + " %88, %89, %90, %91, %92, %93, %94, %95,\n" + " %96, %97, %98, %99, %100, %101, %102, %103,\n" + " %104, %105, %106, %107, %108, %109, %110, %111,\n" + " %112, %113, %114, %115, %116, %117, %118, %119,\n" + " %120, %121, %122, %123, %124, %125, %126, %127 \n" + "},\n" + "{ %128, %129, %130, %131 }, %132, 1;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]), "+r"(acc[8]), "+r"(acc[9]), "+r"(acc[10]), "+r"(acc[11]), + "+r"(acc[12]), "+r"(acc[13]), "+r"(acc[14]), "+r"(acc[15]), "+r"(acc[16]), "+r"(acc[17]), + "+r"(acc[18]), "+r"(acc[19]), "+r"(acc[20]), "+r"(acc[21]), "+r"(acc[22]), "+r"(acc[23]), + "+r"(acc[24]), "+r"(acc[25]), "+r"(acc[26]), "+r"(acc[27]), "+r"(acc[28]), "+r"(acc[29]), + "+r"(acc[30]), "+r"(acc[31]), "+r"(acc[32]), "+r"(acc[33]), "+r"(acc[34]), "+r"(acc[35]), + "+r"(acc[36]), "+r"(acc[37]), "+r"(acc[38]), "+r"(acc[39]), "+r"(acc[40]), "+r"(acc[41]), + "+r"(acc[42]), "+r"(acc[43]), "+r"(acc[44]), "+r"(acc[45]), "+r"(acc[46]), "+r"(acc[47]), + "+r"(acc[48]), "+r"(acc[49]), "+r"(acc[50]), "+r"(acc[51]), "+r"(acc[52]), "+r"(acc[53]), + "+r"(acc[54]), "+r"(acc[55]), "+r"(acc[56]), "+r"(acc[57]), "+r"(acc[58]), "+r"(acc[59]), + "+r"(acc[60]), "+r"(acc[61]), "+r"(acc[62]), "+r"(acc[63]), "+r"(acc[64]), "+r"(acc[65]), + "+r"(acc[66]), "+r"(acc[67]), "+r"(acc[68]), "+r"(acc[69]), "+r"(acc[70]), "+r"(acc[71]), + "+r"(acc[72]), "+r"(acc[73]), "+r"(acc[74]), "+r"(acc[75]), "+r"(acc[76]), "+r"(acc[77]), + "+r"(acc[78]), "+r"(acc[79]), "+r"(acc[80]), "+r"(acc[81]), "+r"(acc[82]), "+r"(acc[83]), + "+r"(acc[84]), "+r"(acc[85]), "+r"(acc[86]), "+r"(acc[87]), "+r"(acc[88]), "+r"(acc[89]), + "+r"(acc[90]), "+r"(acc[91]), "+r"(acc[92]), "+r"(acc[93]), "+r"(acc[94]), "+r"(acc[95]), + "+r"(acc[96]), "+r"(acc[97]), "+r"(acc[98]), "+r"(acc[99]), "+r"(acc[100]), + "+r"(acc[101]), "+r"(acc[102]), "+r"(acc[103]), "+r"(acc[104]), "+r"(acc[105]), + "+r"(acc[106]), "+r"(acc[107]), "+r"(acc[108]), "+r"(acc[109]), "+r"(acc[110]), + "+r"(acc[111]), "+r"(acc[112]), "+r"(acc[113]), "+r"(acc[114]), "+r"(acc[115]), + "+r"(acc[116]), "+r"(acc[117]), "+r"(acc[118]), "+r"(acc[119]), "+r"(acc[120]), + "+r"(acc[121]), "+r"(acc[122]), "+r"(acc[123]), "+r"(acc[124]), "+r"(acc[125]), + "+r"(acc[126]), "+r"(acc[127]) + : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "l"(desc_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ void igmma_rfa_int8_int32(uint32_t const (&a)[4], uint64_t desc_b, + uint32_t (&acc)[N / 2]) { + Igmma_rfa_int8_int32::mma(a, desc_b, acc); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace fmha diff --git a/csrc/fmha_v2/fmha/hopper/utils_qgmma.h b/csrc/fmha_v2/fmha/hopper/utils_qgmma.h new file mode 100644 index 0000000000..28571b15b9 --- /dev/null +++ b/csrc/fmha_v2/fmha/hopper/utils_qgmma.h @@ -0,0 +1,2089 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2011-2024 NVIDIA CORPORATION & AFFILIATES. All rights + * reserved. 
SPDX-License-Identifier: NVIDIA TensorRT Source Code License Agreement + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. + */ + +#pragma once + +namespace fmha { + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// QGMMA 64xNx32 TN with int32 Accumulator with A and B from SMEM +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Qgmma_e4m3_e4m3_fp32 {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x32x32 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Qgmma_e4m3_e4m3_fp32<32> { + static inline __device__ void mma(uint64_t desc_a, uint64_t desc_b, uint32_t (&acc)[16]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n32k32.f32.e4m3.e4m3\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7,\n" + " %8, %9, %10, %11, %12, %13, %14, %15\n" + "},\n" + " %16, %17, 1, 1, 1;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]), "+r"(acc[8]), "+r"(acc[9]), "+r"(acc[10]), "+r"(acc[11]), + "+r"(acc[12]), "+r"(acc[13]), "+r"(acc[14]), "+r"(acc[15]) + : "l"(desc_a), "l"(desc_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x64x32 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Qgmma_e4m3_e4m3_fp32<64> { + static inline __device__ void mma(uint64_t desc_a, uint64_t desc_b, uint32_t (&acc)[32]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n64k32.f32.e4m3.e4m3\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7,\n" + " %8, %9, %10, %11, %12, %13, %14, %15,\n" + " %16, %17, %18, %19, %20, %21, %22, %23,\n" + " %24, %25, %26, %27, %28, %29, %30, %31 \n" + "},\n" + " %32, %33, 1, 1, 1;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]), "+r"(acc[8]), "+r"(acc[9]), "+r"(acc[10]), "+r"(acc[11]), + "+r"(acc[12]), "+r"(acc[13]), "+r"(acc[14]), "+r"(acc[15]), "+r"(acc[16]), "+r"(acc[17]), + "+r"(acc[18]), "+r"(acc[19]), "+r"(acc[20]), "+r"(acc[21]), "+r"(acc[22]), "+r"(acc[23]), + "+r"(acc[24]), "+r"(acc[25]), "+r"(acc[26]), "+r"(acc[27]), "+r"(acc[28]), "+r"(acc[29]), + "+r"(acc[30]), "+r"(acc[31]) + : "l"(desc_a), "l"(desc_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x128x32 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Qgmma_e4m3_e4m3_fp32<128> { + static inline __device__ void mma(uint64_t desc_a, uint64_t desc_b, uint32_t (&acc)[64]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + asm volatile( + 
"wgmma.mma_async.sync.aligned.m64n128k32.f32.e4m3.e4m3\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7,\n" + " %8, %9, %10, %11, %12, %13, %14, %15,\n" + " %16, %17, %18, %19, %20, %21, %22, %23,\n" + " %24, %25, %26, %27, %28, %29, %30, %31,\n" + " %32, %33, %34, %35, %36, %37, %38, %39,\n" + " %40, %41, %42, %43, %44, %45, %46, %47,\n" + " %48, %49, %50, %51, %52, %53, %54, %55,\n" + " %56, %57, %58, %59, %60, %61, %62, %63 \n" + "},\n" + " %64, %65, 1, 1, 1;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]), "+r"(acc[8]), "+r"(acc[9]), "+r"(acc[10]), "+r"(acc[11]), + "+r"(acc[12]), "+r"(acc[13]), "+r"(acc[14]), "+r"(acc[15]), "+r"(acc[16]), "+r"(acc[17]), + "+r"(acc[18]), "+r"(acc[19]), "+r"(acc[20]), "+r"(acc[21]), "+r"(acc[22]), "+r"(acc[23]), + "+r"(acc[24]), "+r"(acc[25]), "+r"(acc[26]), "+r"(acc[27]), "+r"(acc[28]), "+r"(acc[29]), + "+r"(acc[30]), "+r"(acc[31]), "+r"(acc[32]), "+r"(acc[33]), "+r"(acc[34]), "+r"(acc[35]), + "+r"(acc[36]), "+r"(acc[37]), "+r"(acc[38]), "+r"(acc[39]), "+r"(acc[40]), "+r"(acc[41]), + "+r"(acc[42]), "+r"(acc[43]), "+r"(acc[44]), "+r"(acc[45]), "+r"(acc[46]), "+r"(acc[47]), + "+r"(acc[48]), "+r"(acc[49]), "+r"(acc[50]), "+r"(acc[51]), "+r"(acc[52]), "+r"(acc[53]), + "+r"(acc[54]), "+r"(acc[55]), "+r"(acc[56]), "+r"(acc[57]), "+r"(acc[58]), "+r"(acc[59]), + "+r"(acc[60]), "+r"(acc[61]), "+r"(acc[62]), "+r"(acc[63]) + : "l"(desc_a), "l"(desc_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x192x32 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Qgmma_e4m3_e4m3_fp32<192> { + static inline __device__ void mma(uint64_t desc_a, uint64_t desc_b, uint32_t (&acc)[96]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n192k32.f32.e4m3.e4m3\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7,\n" + " %8, %9, %10, %11, %12, %13, %14, %15,\n" + " %16, %17, %18, %19, %20, %21, %22, %23,\n" + " %24, %25, %26, %27, %28, %29, %30, %31,\n" + " %32, %33, %34, %35, %36, %37, %38, %39,\n" + " %40, %41, %42, %43, %44, %45, %46, %47,\n" + " %48, %49, %50, %51, %52, %53, %54, %55,\n" + " %56, %57, %58, %59, %60, %61, %62, %63,\n" + " %64, %65, %66, %67, %68, %69, %70, %71,\n" + " %72, %73, %74, %75, %76, %77, %78, %79,\n" + " %80, %81, %82, %83, %84, %85, %86, %87,\n" + " %88, %89, %90, %91, %92, %93, %94, %95 \n" + "},\n" + " %96, %97, 1, 1, 1;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]), "+r"(acc[8]), "+r"(acc[9]), "+r"(acc[10]), "+r"(acc[11]), + "+r"(acc[12]), "+r"(acc[13]), "+r"(acc[14]), "+r"(acc[15]), "+r"(acc[16]), "+r"(acc[17]), + "+r"(acc[18]), "+r"(acc[19]), "+r"(acc[20]), "+r"(acc[21]), "+r"(acc[22]), "+r"(acc[23]), + "+r"(acc[24]), "+r"(acc[25]), "+r"(acc[26]), "+r"(acc[27]), "+r"(acc[28]), "+r"(acc[29]), + "+r"(acc[30]), "+r"(acc[31]), "+r"(acc[32]), "+r"(acc[33]), "+r"(acc[34]), "+r"(acc[35]), + "+r"(acc[36]), "+r"(acc[37]), "+r"(acc[38]), "+r"(acc[39]), "+r"(acc[40]), "+r"(acc[41]), + "+r"(acc[42]), "+r"(acc[43]), "+r"(acc[44]), "+r"(acc[45]), "+r"(acc[46]), "+r"(acc[47]), + "+r"(acc[48]), "+r"(acc[49]), "+r"(acc[50]), "+r"(acc[51]), "+r"(acc[52]), "+r"(acc[53]), + "+r"(acc[54]), "+r"(acc[55]), "+r"(acc[56]), "+r"(acc[57]), "+r"(acc[58]), "+r"(acc[59]), + "+r"(acc[60]), 
"+r"(acc[61]), "+r"(acc[62]), "+r"(acc[63]), "+r"(acc[64]), "+r"(acc[65]), + "+r"(acc[66]), "+r"(acc[67]), "+r"(acc[68]), "+r"(acc[69]), "+r"(acc[70]), "+r"(acc[71]), + "+r"(acc[72]), "+r"(acc[73]), "+r"(acc[74]), "+r"(acc[75]), "+r"(acc[76]), "+r"(acc[77]), + "+r"(acc[78]), "+r"(acc[79]), "+r"(acc[80]), "+r"(acc[81]), "+r"(acc[82]), "+r"(acc[83]), + "+r"(acc[84]), "+r"(acc[85]), "+r"(acc[86]), "+r"(acc[87]), "+r"(acc[88]), "+r"(acc[89]), + "+r"(acc[90]), "+r"(acc[91]), "+r"(acc[92]), "+r"(acc[93]), "+r"(acc[94]), "+r"(acc[95]) + : "l"(desc_a), "l"(desc_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x256x32 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Qgmma_e4m3_e4m3_fp32<256> { + static inline __device__ void mma(uint64_t desc_a, uint64_t desc_b, uint32_t (&acc)[128]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n256k32.f32.e4m3.e4m3\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7,\n" + " %8, %9, %10, %11, %12, %13, %14, %15,\n" + " %16, %17, %18, %19, %20, %21, %22, %23,\n" + " %24, %25, %26, %27, %28, %29, %30, %31,\n" + " %32, %33, %34, %35, %36, %37, %38, %39,\n" + " %40, %41, %42, %43, %44, %45, %46, %47,\n" + " %48, %49, %50, %51, %52, %53, %54, %55,\n" + " %56, %57, %58, %59, %60, %61, %62, %63,\n" + " %64, %65, %66, %67, %68, %69, %70, %71,\n" + " %72, %73, %74, %75, %76, %77, %78, %79,\n" + " %80, %81, %82, %83, %84, %85, %86, %87,\n" + " %88, %89, %90, %91, %92, %93, %94, %95,\n" + " %96, %97, %98, %99, %100, %101, %102, %103,\n" + " %104, %105, %106, %107, %108, %109, %110, %111,\n" + " %112, %113, %114, %115, %116, %117, %118, %119,\n" + " %120, %121, %122, %123, %124, %125, %126, %127 \n" + "},\n" + " %128, %129, 1, 1, 1;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]), "+r"(acc[8]), "+r"(acc[9]), "+r"(acc[10]), "+r"(acc[11]), + "+r"(acc[12]), "+r"(acc[13]), "+r"(acc[14]), "+r"(acc[15]), "+r"(acc[16]), "+r"(acc[17]), + "+r"(acc[18]), "+r"(acc[19]), "+r"(acc[20]), "+r"(acc[21]), "+r"(acc[22]), "+r"(acc[23]), + "+r"(acc[24]), "+r"(acc[25]), "+r"(acc[26]), "+r"(acc[27]), "+r"(acc[28]), "+r"(acc[29]), + "+r"(acc[30]), "+r"(acc[31]), "+r"(acc[32]), "+r"(acc[33]), "+r"(acc[34]), "+r"(acc[35]), + "+r"(acc[36]), "+r"(acc[37]), "+r"(acc[38]), "+r"(acc[39]), "+r"(acc[40]), "+r"(acc[41]), + "+r"(acc[42]), "+r"(acc[43]), "+r"(acc[44]), "+r"(acc[45]), "+r"(acc[46]), "+r"(acc[47]), + "+r"(acc[48]), "+r"(acc[49]), "+r"(acc[50]), "+r"(acc[51]), "+r"(acc[52]), "+r"(acc[53]), + "+r"(acc[54]), "+r"(acc[55]), "+r"(acc[56]), "+r"(acc[57]), "+r"(acc[58]), "+r"(acc[59]), + "+r"(acc[60]), "+r"(acc[61]), "+r"(acc[62]), "+r"(acc[63]), "+r"(acc[64]), "+r"(acc[65]), + "+r"(acc[66]), "+r"(acc[67]), "+r"(acc[68]), "+r"(acc[69]), "+r"(acc[70]), "+r"(acc[71]), + "+r"(acc[72]), "+r"(acc[73]), "+r"(acc[74]), "+r"(acc[75]), "+r"(acc[76]), "+r"(acc[77]), + "+r"(acc[78]), "+r"(acc[79]), "+r"(acc[80]), "+r"(acc[81]), "+r"(acc[82]), "+r"(acc[83]), + "+r"(acc[84]), "+r"(acc[85]), "+r"(acc[86]), "+r"(acc[87]), "+r"(acc[88]), "+r"(acc[89]), + "+r"(acc[90]), "+r"(acc[91]), "+r"(acc[92]), "+r"(acc[93]), "+r"(acc[94]), "+r"(acc[95]), + "+r"(acc[96]), "+r"(acc[97]), "+r"(acc[98]), "+r"(acc[99]), "+r"(acc[100]), + "+r"(acc[101]), "+r"(acc[102]), "+r"(acc[103]), "+r"(acc[104]), "+r"(acc[105]), + 
"+r"(acc[106]), "+r"(acc[107]), "+r"(acc[108]), "+r"(acc[109]), "+r"(acc[110]), + "+r"(acc[111]), "+r"(acc[112]), "+r"(acc[113]), "+r"(acc[114]), "+r"(acc[115]), + "+r"(acc[116]), "+r"(acc[117]), "+r"(acc[118]), "+r"(acc[119]), "+r"(acc[120]), + "+r"(acc[121]), "+r"(acc[122]), "+r"(acc[123]), "+r"(acc[124]), "+r"(acc[125]), + "+r"(acc[126]), "+r"(acc[127]) + : "l"(desc_a), "l"(desc_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ void qgmma_e4m3_e4m3_fp32(uint64_t desc_a, uint64_t desc_b, + uint32_t (&acc)[N / 2]) { + Qgmma_e4m3_e4m3_fp32::mma(desc_a, desc_b, acc); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// QGMMA 64xNx32 TN with int32 Accumulator with A from RF and B from SMEM +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Qgmma_rfa_e4m3_e4m3_fp32 {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x32x32 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Qgmma_rfa_e4m3_e4m3_fp32<32> { + static inline __device__ void mma(uint32_t const (&a)[4], uint64_t desc_b, uint32_t (&acc)[16]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n32k32.f32.e4m3.e4m3\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7,\n" + " %8, %9, %10, %11, %12, %13, %14, %15\n" + "},\n" + "{ %16, %17, %18, %19 }, %20, 1, 1, 1;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]), "+r"(acc[8]), "+r"(acc[9]), "+r"(acc[10]), "+r"(acc[11]), + "+r"(acc[12]), "+r"(acc[13]), "+r"(acc[14]), "+r"(acc[15]) + : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "l"(desc_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x64x32 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Qgmma_rfa_e4m3_e4m3_fp32<64> { + static inline __device__ void mma(uint32_t const (&a)[4], uint64_t desc_b, uint32_t (&acc)[32]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n64k32.f32.e4m3.e4m3\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7,\n" + " %8, %9, %10, %11, %12, %13, %14, %15,\n" + " %16, %17, %18, %19, %20, %21, %22, %23,\n" + " %24, %25, %26, %27, %28, %29, %30, %31 \n" + "},\n" + "{ %32, %33, %34, %35 }, %36, 1, 1, 1;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]), "+r"(acc[8]), "+r"(acc[9]), "+r"(acc[10]), "+r"(acc[11]), + "+r"(acc[12]), "+r"(acc[13]), "+r"(acc[14]), "+r"(acc[15]), "+r"(acc[16]), "+r"(acc[17]), + "+r"(acc[18]), "+r"(acc[19]), "+r"(acc[20]), "+r"(acc[21]), "+r"(acc[22]), "+r"(acc[23]), + "+r"(acc[24]), "+r"(acc[25]), "+r"(acc[26]), "+r"(acc[27]), "+r"(acc[28]), "+r"(acc[29]), + "+r"(acc[30]), "+r"(acc[31]) + : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "l"(desc_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x128x32 
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Qgmma_rfa_e4m3_e4m3_fp32<128> { + static inline __device__ void mma(uint32_t const (&a)[4], uint64_t desc_b, uint32_t (&acc)[64]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n128k32.f32.e4m3.e4m3\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7,\n" + " %8, %9, %10, %11, %12, %13, %14, %15,\n" + " %16, %17, %18, %19, %20, %21, %22, %23,\n" + " %24, %25, %26, %27, %28, %29, %30, %31,\n" + " %32, %33, %34, %35, %36, %37, %38, %39,\n" + " %40, %41, %42, %43, %44, %45, %46, %47,\n" + " %48, %49, %50, %51, %52, %53, %54, %55,\n" + " %56, %57, %58, %59, %60, %61, %62, %63 \n" + "},\n" + "{ %64, %65, %66, %67 }, %68, 1, 1, 1;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]), "+r"(acc[8]), "+r"(acc[9]), "+r"(acc[10]), "+r"(acc[11]), + "+r"(acc[12]), "+r"(acc[13]), "+r"(acc[14]), "+r"(acc[15]), "+r"(acc[16]), "+r"(acc[17]), + "+r"(acc[18]), "+r"(acc[19]), "+r"(acc[20]), "+r"(acc[21]), "+r"(acc[22]), "+r"(acc[23]), + "+r"(acc[24]), "+r"(acc[25]), "+r"(acc[26]), "+r"(acc[27]), "+r"(acc[28]), "+r"(acc[29]), + "+r"(acc[30]), "+r"(acc[31]), "+r"(acc[32]), "+r"(acc[33]), "+r"(acc[34]), "+r"(acc[35]), + "+r"(acc[36]), "+r"(acc[37]), "+r"(acc[38]), "+r"(acc[39]), "+r"(acc[40]), "+r"(acc[41]), + "+r"(acc[42]), "+r"(acc[43]), "+r"(acc[44]), "+r"(acc[45]), "+r"(acc[46]), "+r"(acc[47]), + "+r"(acc[48]), "+r"(acc[49]), "+r"(acc[50]), "+r"(acc[51]), "+r"(acc[52]), "+r"(acc[53]), + "+r"(acc[54]), "+r"(acc[55]), "+r"(acc[56]), "+r"(acc[57]), "+r"(acc[58]), "+r"(acc[59]), + "+r"(acc[60]), "+r"(acc[61]), "+r"(acc[62]), "+r"(acc[63]) + : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "l"(desc_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x192x32 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Qgmma_rfa_e4m3_e4m3_fp32<192> { + static inline __device__ void mma(uint32_t const (&a)[4], uint64_t desc_b, uint32_t (&acc)[96]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n192k32.f32.e4m3.e4m3\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7,\n" + " %8, %9, %10, %11, %12, %13, %14, %15,\n" + " %16, %17, %18, %19, %20, %21, %22, %23,\n" + " %24, %25, %26, %27, %28, %29, %30, %31,\n" + " %32, %33, %34, %35, %36, %37, %38, %39,\n" + " %40, %41, %42, %43, %44, %45, %46, %47,\n" + " %48, %49, %50, %51, %52, %53, %54, %55,\n" + " %56, %57, %58, %59, %60, %61, %62, %63,\n" + " %64, %65, %66, %67, %68, %69, %70, %71,\n" + " %72, %73, %74, %75, %76, %77, %78, %79,\n" + " %80, %81, %82, %83, %84, %85, %86, %87,\n" + " %88, %89, %90, %91, %92, %93, %94, %95 \n" + "},\n" + "{ %96, %97, %98, %99 }, %100, 1, 1, 1;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]), "+r"(acc[8]), "+r"(acc[9]), "+r"(acc[10]), "+r"(acc[11]), + "+r"(acc[12]), "+r"(acc[13]), "+r"(acc[14]), "+r"(acc[15]), "+r"(acc[16]), "+r"(acc[17]), + "+r"(acc[18]), "+r"(acc[19]), "+r"(acc[20]), "+r"(acc[21]), "+r"(acc[22]), "+r"(acc[23]), + "+r"(acc[24]), "+r"(acc[25]), "+r"(acc[26]), "+r"(acc[27]), "+r"(acc[28]), "+r"(acc[29]), + "+r"(acc[30]), "+r"(acc[31]), 
"+r"(acc[32]), "+r"(acc[33]), "+r"(acc[34]), "+r"(acc[35]), + "+r"(acc[36]), "+r"(acc[37]), "+r"(acc[38]), "+r"(acc[39]), "+r"(acc[40]), "+r"(acc[41]), + "+r"(acc[42]), "+r"(acc[43]), "+r"(acc[44]), "+r"(acc[45]), "+r"(acc[46]), "+r"(acc[47]), + "+r"(acc[48]), "+r"(acc[49]), "+r"(acc[50]), "+r"(acc[51]), "+r"(acc[52]), "+r"(acc[53]), + "+r"(acc[54]), "+r"(acc[55]), "+r"(acc[56]), "+r"(acc[57]), "+r"(acc[58]), "+r"(acc[59]), + "+r"(acc[60]), "+r"(acc[61]), "+r"(acc[62]), "+r"(acc[63]), "+r"(acc[64]), "+r"(acc[65]), + "+r"(acc[66]), "+r"(acc[67]), "+r"(acc[68]), "+r"(acc[69]), "+r"(acc[70]), "+r"(acc[71]), + "+r"(acc[72]), "+r"(acc[73]), "+r"(acc[74]), "+r"(acc[75]), "+r"(acc[76]), "+r"(acc[77]), + "+r"(acc[78]), "+r"(acc[79]), "+r"(acc[80]), "+r"(acc[81]), "+r"(acc[82]), "+r"(acc[83]), + "+r"(acc[84]), "+r"(acc[85]), "+r"(acc[86]), "+r"(acc[87]), "+r"(acc[88]), "+r"(acc[89]), + "+r"(acc[90]), "+r"(acc[91]), "+r"(acc[92]), "+r"(acc[93]), "+r"(acc[94]), "+r"(acc[95]) + : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "l"(desc_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x256x32 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Qgmma_rfa_e4m3_e4m3_fp32<256> { + static inline __device__ void mma(uint32_t const (&a)[4], uint64_t desc_b, uint32_t (&acc)[128]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n256k32.f32.e4m3.e4m3\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7,\n" + " %8, %9, %10, %11, %12, %13, %14, %15,\n" + " %16, %17, %18, %19, %20, %21, %22, %23,\n" + " %24, %25, %26, %27, %28, %29, %30, %31,\n" + " %32, %33, %34, %35, %36, %37, %38, %39,\n" + " %40, %41, %42, %43, %44, %45, %46, %47,\n" + " %48, %49, %50, %51, %52, %53, %54, %55,\n" + " %56, %57, %58, %59, %60, %61, %62, %63,\n" + " %64, %65, %66, %67, %68, %69, %70, %71,\n" + " %72, %73, %74, %75, %76, %77, %78, %79,\n" + " %80, %81, %82, %83, %84, %85, %86, %87,\n" + " %88, %89, %90, %91, %92, %93, %94, %95,\n" + " %96, %97, %98, %99, %100, %101, %102, %103,\n" + " %104, %105, %106, %107, %108, %109, %110, %111,\n" + " %112, %113, %114, %115, %116, %117, %118, %119,\n" + " %120, %121, %122, %123, %124, %125, %126, %127 \n" + "},\n" + "{ %128, %129, %130, %131 }, %132, 1, 1, 1;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]), "+r"(acc[8]), "+r"(acc[9]), "+r"(acc[10]), "+r"(acc[11]), + "+r"(acc[12]), "+r"(acc[13]), "+r"(acc[14]), "+r"(acc[15]), "+r"(acc[16]), "+r"(acc[17]), + "+r"(acc[18]), "+r"(acc[19]), "+r"(acc[20]), "+r"(acc[21]), "+r"(acc[22]), "+r"(acc[23]), + "+r"(acc[24]), "+r"(acc[25]), "+r"(acc[26]), "+r"(acc[27]), "+r"(acc[28]), "+r"(acc[29]), + "+r"(acc[30]), "+r"(acc[31]), "+r"(acc[32]), "+r"(acc[33]), "+r"(acc[34]), "+r"(acc[35]), + "+r"(acc[36]), "+r"(acc[37]), "+r"(acc[38]), "+r"(acc[39]), "+r"(acc[40]), "+r"(acc[41]), + "+r"(acc[42]), "+r"(acc[43]), "+r"(acc[44]), "+r"(acc[45]), "+r"(acc[46]), "+r"(acc[47]), + "+r"(acc[48]), "+r"(acc[49]), "+r"(acc[50]), "+r"(acc[51]), "+r"(acc[52]), "+r"(acc[53]), + "+r"(acc[54]), "+r"(acc[55]), "+r"(acc[56]), "+r"(acc[57]), "+r"(acc[58]), "+r"(acc[59]), + "+r"(acc[60]), "+r"(acc[61]), "+r"(acc[62]), "+r"(acc[63]), "+r"(acc[64]), "+r"(acc[65]), + "+r"(acc[66]), "+r"(acc[67]), "+r"(acc[68]), "+r"(acc[69]), "+r"(acc[70]), "+r"(acc[71]), + "+r"(acc[72]), 
"+r"(acc[73]), "+r"(acc[74]), "+r"(acc[75]), "+r"(acc[76]), "+r"(acc[77]), + "+r"(acc[78]), "+r"(acc[79]), "+r"(acc[80]), "+r"(acc[81]), "+r"(acc[82]), "+r"(acc[83]), + "+r"(acc[84]), "+r"(acc[85]), "+r"(acc[86]), "+r"(acc[87]), "+r"(acc[88]), "+r"(acc[89]), + "+r"(acc[90]), "+r"(acc[91]), "+r"(acc[92]), "+r"(acc[93]), "+r"(acc[94]), "+r"(acc[95]), + "+r"(acc[96]), "+r"(acc[97]), "+r"(acc[98]), "+r"(acc[99]), "+r"(acc[100]), + "+r"(acc[101]), "+r"(acc[102]), "+r"(acc[103]), "+r"(acc[104]), "+r"(acc[105]), + "+r"(acc[106]), "+r"(acc[107]), "+r"(acc[108]), "+r"(acc[109]), "+r"(acc[110]), + "+r"(acc[111]), "+r"(acc[112]), "+r"(acc[113]), "+r"(acc[114]), "+r"(acc[115]), + "+r"(acc[116]), "+r"(acc[117]), "+r"(acc[118]), "+r"(acc[119]), "+r"(acc[120]), + "+r"(acc[121]), "+r"(acc[122]), "+r"(acc[123]), "+r"(acc[124]), "+r"(acc[125]), + "+r"(acc[126]), "+r"(acc[127]) + : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "l"(desc_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ void qgmma_rfa_e4m3_e4m3_fp32(uint32_t const (&a)[4], uint64_t desc_b, + uint32_t (&acc)[N / 2]) { + Qgmma_rfa_e4m3_e4m3_fp32::mma(a, desc_b, acc); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// QGMMA e4m3 x e5m2 - 64xNx32 TN with int32 Accumulator with A and B from SMEM +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Qgmma_e4m3_e5m2_fp32 {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x8x32 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Qgmma_e4m3_e5m2_fp32<8> { + static inline __device__ void mma(uint64_t desc_a, uint64_t desc_b, uint32_t (&acc)[4]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n64k32.f32.e4m3.e5m2\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7\n" + "},\n" + " %8, %9, 1, 1, 1;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]) + : "l"(desc_a), "l"(desc_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x32x32 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Qgmma_e4m3_e5m2_fp32<32> { + static inline __device__ void mma(uint64_t desc_a, uint64_t desc_b, uint32_t (&acc)[16]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n32k32.f32.e4m3.e5m2\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7,\n" + " %8, %9, %10, %11, %12, %13, %14, %15 \n" + "},\n" + " %16, %17, 1, 1, 1;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]), "+r"(acc[8]), "+r"(acc[9]), "+r"(acc[10]), "+r"(acc[11]), + "+r"(acc[12]), "+r"(acc[13]), "+r"(acc[14]), "+r"(acc[15]) + : "l"(desc_a), "l"(desc_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x64x32 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Qgmma_e4m3_e5m2_fp32<64> { 
+ static inline __device__ void mma(uint64_t desc_a, uint64_t desc_b, uint32_t (&acc)[32]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n64k32.f32.e4m3.e5m2\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7,\n" + " %8, %9, %10, %11, %12, %13, %14, %15,\n" + " %16, %17, %18, %19, %20, %21, %22, %23,\n" + " %24, %25, %26, %27, %28, %29, %30, %31 \n" + "},\n" + " %32, %33, 1, 1, 1;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]), "+r"(acc[8]), "+r"(acc[9]), "+r"(acc[10]), "+r"(acc[11]), + "+r"(acc[12]), "+r"(acc[13]), "+r"(acc[14]), "+r"(acc[15]), "+r"(acc[16]), "+r"(acc[17]), + "+r"(acc[18]), "+r"(acc[19]), "+r"(acc[20]), "+r"(acc[21]), "+r"(acc[22]), "+r"(acc[23]), + "+r"(acc[24]), "+r"(acc[25]), "+r"(acc[26]), "+r"(acc[27]), "+r"(acc[28]), "+r"(acc[29]), + "+r"(acc[30]), "+r"(acc[31]) + : "l"(desc_a), "l"(desc_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x128x32 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Qgmma_e4m3_e5m2_fp32<128> { + static inline __device__ void mma(uint64_t desc_a, uint64_t desc_b, uint32_t (&acc)[64]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n128k32.f32.e4m3.e5m2\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7,\n" + " %8, %9, %10, %11, %12, %13, %14, %15,\n" + " %16, %17, %18, %19, %20, %21, %22, %23,\n" + " %24, %25, %26, %27, %28, %29, %30, %31,\n" + " %32, %33, %34, %35, %36, %37, %38, %39,\n" + " %40, %41, %42, %43, %44, %45, %46, %47,\n" + " %48, %49, %50, %51, %52, %53, %54, %55,\n" + " %56, %57, %58, %59, %60, %61, %62, %63 \n" + "},\n" + " %64, %65, 1, 1, 1;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]), "+r"(acc[8]), "+r"(acc[9]), "+r"(acc[10]), "+r"(acc[11]), + "+r"(acc[12]), "+r"(acc[13]), "+r"(acc[14]), "+r"(acc[15]), "+r"(acc[16]), "+r"(acc[17]), + "+r"(acc[18]), "+r"(acc[19]), "+r"(acc[20]), "+r"(acc[21]), "+r"(acc[22]), "+r"(acc[23]), + "+r"(acc[24]), "+r"(acc[25]), "+r"(acc[26]), "+r"(acc[27]), "+r"(acc[28]), "+r"(acc[29]), + "+r"(acc[30]), "+r"(acc[31]), "+r"(acc[32]), "+r"(acc[33]), "+r"(acc[34]), "+r"(acc[35]), + "+r"(acc[36]), "+r"(acc[37]), "+r"(acc[38]), "+r"(acc[39]), "+r"(acc[40]), "+r"(acc[41]), + "+r"(acc[42]), "+r"(acc[43]), "+r"(acc[44]), "+r"(acc[45]), "+r"(acc[46]), "+r"(acc[47]), + "+r"(acc[48]), "+r"(acc[49]), "+r"(acc[50]), "+r"(acc[51]), "+r"(acc[52]), "+r"(acc[53]), + "+r"(acc[54]), "+r"(acc[55]), "+r"(acc[56]), "+r"(acc[57]), "+r"(acc[58]), "+r"(acc[59]), + "+r"(acc[60]), "+r"(acc[61]), "+r"(acc[62]), "+r"(acc[63]) + : "l"(desc_a), "l"(desc_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x160x32 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Qgmma_e4m3_e5m2_fp32<160> { + static inline __device__ void mma(uint64_t desc_a, uint64_t desc_b, uint32_t (&acc)[80]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n160k32.f32.e4m3.e5m2\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7,\n" + " 
%8, %9, %10, %11, %12, %13, %14, %15,\n" + " %16, %17, %18, %19, %20, %21, %22, %23,\n" + " %24, %25, %26, %27, %28, %29, %30, %31,\n" + " %32, %33, %34, %35, %36, %37, %38, %39,\n" + " %40, %41, %42, %43, %44, %45, %46, %47,\n" + " %48, %49, %50, %51, %52, %53, %54, %55,\n" + " %56, %57, %58, %59, %60, %61, %62, %63,\n" + " %64, %65, %66, %67, %68, %69, %70, %71,\n" + " %72, %73, %74, %75, %76, %77, %78, %79 \n" + "},\n" + " %80, %81, 1, 1, 1;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]), "+r"(acc[8]), "+r"(acc[9]), "+r"(acc[10]), "+r"(acc[11]), + "+r"(acc[12]), "+r"(acc[13]), "+r"(acc[14]), "+r"(acc[15]), "+r"(acc[16]), "+r"(acc[17]), + "+r"(acc[18]), "+r"(acc[19]), "+r"(acc[20]), "+r"(acc[21]), "+r"(acc[22]), "+r"(acc[23]), + "+r"(acc[24]), "+r"(acc[25]), "+r"(acc[26]), "+r"(acc[27]), "+r"(acc[28]), "+r"(acc[29]), + "+r"(acc[30]), "+r"(acc[31]), "+r"(acc[32]), "+r"(acc[33]), "+r"(acc[34]), "+r"(acc[35]), + "+r"(acc[36]), "+r"(acc[37]), "+r"(acc[38]), "+r"(acc[39]), "+r"(acc[40]), "+r"(acc[41]), + "+r"(acc[42]), "+r"(acc[43]), "+r"(acc[44]), "+r"(acc[45]), "+r"(acc[46]), "+r"(acc[47]), + "+r"(acc[48]), "+r"(acc[49]), "+r"(acc[50]), "+r"(acc[51]), "+r"(acc[52]), "+r"(acc[53]), + "+r"(acc[54]), "+r"(acc[55]), "+r"(acc[56]), "+r"(acc[57]), "+r"(acc[58]), "+r"(acc[59]), + "+r"(acc[60]), "+r"(acc[61]), "+r"(acc[62]), "+r"(acc[63]), "+r"(acc[64]), "+r"(acc[65]), + "+r"(acc[66]), "+r"(acc[67]), "+r"(acc[68]), "+r"(acc[69]), "+r"(acc[70]), "+r"(acc[71]), + "+r"(acc[72]), "+r"(acc[73]), "+r"(acc[74]), "+r"(acc[75]), "+r"(acc[76]), "+r"(acc[77]), + "+r"(acc[78]), "+r"(acc[79]) + : "l"(desc_a), "l"(desc_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x192x32 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Qgmma_e4m3_e5m2_fp32<192> { + static inline __device__ void mma(uint64_t desc_a, uint64_t desc_b, uint32_t (&acc)[96]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n192k32.f32.e4m3.e5m2\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7,\n" + " %8, %9, %10, %11, %12, %13, %14, %15,\n" + " %16, %17, %18, %19, %20, %21, %22, %23,\n" + " %24, %25, %26, %27, %28, %29, %30, %31,\n" + " %32, %33, %34, %35, %36, %37, %38, %39,\n" + " %40, %41, %42, %43, %44, %45, %46, %47,\n" + " %48, %49, %50, %51, %52, %53, %54, %55,\n" + " %56, %57, %58, %59, %60, %61, %62, %63,\n" + " %64, %65, %66, %67, %68, %69, %70, %71,\n" + " %72, %73, %74, %75, %76, %77, %78, %79,\n" + " %80, %81, %82, %83, %84, %85, %86, %87,\n" + " %88, %89, %90, %91, %92, %93, %94, %95 \n" + "},\n" + " %96, %97, 1, 1, 1;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]), "+r"(acc[8]), "+r"(acc[9]), "+r"(acc[10]), "+r"(acc[11]), + "+r"(acc[12]), "+r"(acc[13]), "+r"(acc[14]), "+r"(acc[15]), "+r"(acc[16]), "+r"(acc[17]), + "+r"(acc[18]), "+r"(acc[19]), "+r"(acc[20]), "+r"(acc[21]), "+r"(acc[22]), "+r"(acc[23]), + "+r"(acc[24]), "+r"(acc[25]), "+r"(acc[26]), "+r"(acc[27]), "+r"(acc[28]), "+r"(acc[29]), + "+r"(acc[30]), "+r"(acc[31]), "+r"(acc[32]), "+r"(acc[33]), "+r"(acc[34]), "+r"(acc[35]), + "+r"(acc[36]), "+r"(acc[37]), "+r"(acc[38]), "+r"(acc[39]), "+r"(acc[40]), "+r"(acc[41]), + "+r"(acc[42]), "+r"(acc[43]), "+r"(acc[44]), "+r"(acc[45]), 
"+r"(acc[46]), "+r"(acc[47]), + "+r"(acc[48]), "+r"(acc[49]), "+r"(acc[50]), "+r"(acc[51]), "+r"(acc[52]), "+r"(acc[53]), + "+r"(acc[54]), "+r"(acc[55]), "+r"(acc[56]), "+r"(acc[57]), "+r"(acc[58]), "+r"(acc[59]), + "+r"(acc[60]), "+r"(acc[61]), "+r"(acc[62]), "+r"(acc[63]), "+r"(acc[64]), "+r"(acc[65]), + "+r"(acc[66]), "+r"(acc[67]), "+r"(acc[68]), "+r"(acc[69]), "+r"(acc[70]), "+r"(acc[71]), + "+r"(acc[72]), "+r"(acc[73]), "+r"(acc[74]), "+r"(acc[75]), "+r"(acc[76]), "+r"(acc[77]), + "+r"(acc[78]), "+r"(acc[79]), "+r"(acc[80]), "+r"(acc[81]), "+r"(acc[82]), "+r"(acc[83]), + "+r"(acc[84]), "+r"(acc[85]), "+r"(acc[86]), "+r"(acc[87]), "+r"(acc[88]), "+r"(acc[89]), + "+r"(acc[90]), "+r"(acc[91]), "+r"(acc[92]), "+r"(acc[93]), "+r"(acc[94]), "+r"(acc[95]) + : "l"(desc_a), "l"(desc_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x256x32 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Qgmma_e4m3_e5m2_fp32<256> { + static inline __device__ void mma(uint64_t desc_a, uint64_t desc_b, uint32_t (&acc)[128]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n256k32.f32.e4m3.e5m2\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7,\n" + " %8, %9, %10, %11, %12, %13, %14, %15,\n" + " %16, %17, %18, %19, %20, %21, %22, %23,\n" + " %24, %25, %26, %27, %28, %29, %30, %31,\n" + " %32, %33, %34, %35, %36, %37, %38, %39,\n" + " %40, %41, %42, %43, %44, %45, %46, %47,\n" + " %48, %49, %50, %51, %52, %53, %54, %55,\n" + " %56, %57, %58, %59, %60, %61, %62, %63,\n" + " %64, %65, %66, %67, %68, %69, %70, %71,\n" + " %72, %73, %74, %75, %76, %77, %78, %79,\n" + " %80, %81, %82, %83, %84, %85, %86, %87,\n" + " %88, %89, %90, %91, %92, %93, %94, %95,\n" + " %96, %97, %98, %99, %100, %101, %102, %103,\n" + " %104, %105, %106, %107, %108, %109, %110, %111,\n" + " %112, %113, %114, %115, %116, %117, %118, %119,\n" + " %120, %121, %122, %123, %124, %125, %126, %127 \n" + "},\n" + " %128, %129, 1, 1, 1;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]), "+r"(acc[8]), "+r"(acc[9]), "+r"(acc[10]), "+r"(acc[11]), + "+r"(acc[12]), "+r"(acc[13]), "+r"(acc[14]), "+r"(acc[15]), "+r"(acc[16]), "+r"(acc[17]), + "+r"(acc[18]), "+r"(acc[19]), "+r"(acc[20]), "+r"(acc[21]), "+r"(acc[22]), "+r"(acc[23]), + "+r"(acc[24]), "+r"(acc[25]), "+r"(acc[26]), "+r"(acc[27]), "+r"(acc[28]), "+r"(acc[29]), + "+r"(acc[30]), "+r"(acc[31]), "+r"(acc[32]), "+r"(acc[33]), "+r"(acc[34]), "+r"(acc[35]), + "+r"(acc[36]), "+r"(acc[37]), "+r"(acc[38]), "+r"(acc[39]), "+r"(acc[40]), "+r"(acc[41]), + "+r"(acc[42]), "+r"(acc[43]), "+r"(acc[44]), "+r"(acc[45]), "+r"(acc[46]), "+r"(acc[47]), + "+r"(acc[48]), "+r"(acc[49]), "+r"(acc[50]), "+r"(acc[51]), "+r"(acc[52]), "+r"(acc[53]), + "+r"(acc[54]), "+r"(acc[55]), "+r"(acc[56]), "+r"(acc[57]), "+r"(acc[58]), "+r"(acc[59]), + "+r"(acc[60]), "+r"(acc[61]), "+r"(acc[62]), "+r"(acc[63]), "+r"(acc[64]), "+r"(acc[65]), + "+r"(acc[66]), "+r"(acc[67]), "+r"(acc[68]), "+r"(acc[69]), "+r"(acc[70]), "+r"(acc[71]), + "+r"(acc[72]), "+r"(acc[73]), "+r"(acc[74]), "+r"(acc[75]), "+r"(acc[76]), "+r"(acc[77]), + "+r"(acc[78]), "+r"(acc[79]), "+r"(acc[80]), "+r"(acc[81]), "+r"(acc[82]), "+r"(acc[83]), + "+r"(acc[84]), "+r"(acc[85]), "+r"(acc[86]), "+r"(acc[87]), "+r"(acc[88]), "+r"(acc[89]), + "+r"(acc[90]), 
"+r"(acc[91]), "+r"(acc[92]), "+r"(acc[93]), "+r"(acc[94]), "+r"(acc[95]), + "+r"(acc[96]), "+r"(acc[97]), "+r"(acc[98]), "+r"(acc[99]), "+r"(acc[100]), + "+r"(acc[101]), "+r"(acc[102]), "+r"(acc[103]), "+r"(acc[104]), "+r"(acc[105]), + "+r"(acc[106]), "+r"(acc[107]), "+r"(acc[108]), "+r"(acc[109]), "+r"(acc[110]), + "+r"(acc[111]), "+r"(acc[112]), "+r"(acc[113]), "+r"(acc[114]), "+r"(acc[115]), + "+r"(acc[116]), "+r"(acc[117]), "+r"(acc[118]), "+r"(acc[119]), "+r"(acc[120]), + "+r"(acc[121]), "+r"(acc[122]), "+r"(acc[123]), "+r"(acc[124]), "+r"(acc[125]), + "+r"(acc[126]), "+r"(acc[127]) + : "l"(desc_a), "l"(desc_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ void qgmma_e4m3_e5m2_fp32(uint64_t desc_a, uint64_t desc_b, + uint32_t (&acc)[N / 2]) { + Qgmma_e4m3_e5m2_fp32::mma(desc_a, desc_b, acc); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// QGMMA e4m3 x e5m2 - 64xNx32 TN with int32 Accumulator with A from RF and B from SMEM +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Qgmma_rfa_e4m3_e5m2_fp32 {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x8x32 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Qgmma_rfa_e4m3_e5m2_fp32<8> { + static inline __device__ void mma(uint32_t const (&a)[4], uint64_t desc_b, uint32_t (&acc)[4]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n8k32.f32.e4m3.e5m2\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7\n" + "},\n" + "{ %8, %9, %10, %11 }, %12, 1, 1, 1;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]) + : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "l"(desc_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x32x32 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Qgmma_rfa_e4m3_e5m2_fp32<32> { + static inline __device__ void mma(uint32_t const (&a)[4], uint64_t desc_b, uint32_t (&acc)[16]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n32k32.f32.e4m3.e5m2\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7,\n" + " %8, %9, %10, %11, %12, %13, %14, %15 \n" + "},\n" + "{ %16, %17, %18, %19 }, %20, 1, 1, 1;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]), "+r"(acc[8]), "+r"(acc[9]), "+r"(acc[10]), "+r"(acc[11]), + "+r"(acc[12]), "+r"(acc[13]), "+r"(acc[14]), "+r"(acc[15]) + : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "l"(desc_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x64x32 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Qgmma_rfa_e4m3_e5m2_fp32<64> { + static inline __device__ void mma(uint32_t const (&a)[4], uint64_t desc_b, uint32_t (&acc)[32]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && 
defined(__CUDA_ARCH_FEAT_SM90_ALL) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n64k32.f32.e4m3.e5m2\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7,\n" + " %8, %9, %10, %11, %12, %13, %14, %15,\n" + " %16, %17, %18, %19, %20, %21, %22, %23,\n" + " %24, %25, %26, %27, %28, %29, %30, %31 \n" + "},\n" + "{ %32, %33, %34, %35 }, %36, 1, 1, 1;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]), "+r"(acc[8]), "+r"(acc[9]), "+r"(acc[10]), "+r"(acc[11]), + "+r"(acc[12]), "+r"(acc[13]), "+r"(acc[14]), "+r"(acc[15]), "+r"(acc[16]), "+r"(acc[17]), + "+r"(acc[18]), "+r"(acc[19]), "+r"(acc[20]), "+r"(acc[21]), "+r"(acc[22]), "+r"(acc[23]), + "+r"(acc[24]), "+r"(acc[25]), "+r"(acc[26]), "+r"(acc[27]), "+r"(acc[28]), "+r"(acc[29]), + "+r"(acc[30]), "+r"(acc[31]) + : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "l"(desc_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x128x32 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Qgmma_rfa_e4m3_e5m2_fp32<128> { + static inline __device__ void mma(uint32_t const (&a)[4], uint64_t desc_b, uint32_t (&acc)[64]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n128k32.f32.e4m3.e5m2\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7,\n" + " %8, %9, %10, %11, %12, %13, %14, %15,\n" + " %16, %17, %18, %19, %20, %21, %22, %23,\n" + " %24, %25, %26, %27, %28, %29, %30, %31,\n" + " %32, %33, %34, %35, %36, %37, %38, %39,\n" + " %40, %41, %42, %43, %44, %45, %46, %47,\n" + " %48, %49, %50, %51, %52, %53, %54, %55,\n" + " %56, %57, %58, %59, %60, %61, %62, %63 \n" + "},\n" + "{ %64, %65, %66, %67 }, %68, 1, 1, 1;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]), "+r"(acc[8]), "+r"(acc[9]), "+r"(acc[10]), "+r"(acc[11]), + "+r"(acc[12]), "+r"(acc[13]), "+r"(acc[14]), "+r"(acc[15]), "+r"(acc[16]), "+r"(acc[17]), + "+r"(acc[18]), "+r"(acc[19]), "+r"(acc[20]), "+r"(acc[21]), "+r"(acc[22]), "+r"(acc[23]), + "+r"(acc[24]), "+r"(acc[25]), "+r"(acc[26]), "+r"(acc[27]), "+r"(acc[28]), "+r"(acc[29]), + "+r"(acc[30]), "+r"(acc[31]), "+r"(acc[32]), "+r"(acc[33]), "+r"(acc[34]), "+r"(acc[35]), + "+r"(acc[36]), "+r"(acc[37]), "+r"(acc[38]), "+r"(acc[39]), "+r"(acc[40]), "+r"(acc[41]), + "+r"(acc[42]), "+r"(acc[43]), "+r"(acc[44]), "+r"(acc[45]), "+r"(acc[46]), "+r"(acc[47]), + "+r"(acc[48]), "+r"(acc[49]), "+r"(acc[50]), "+r"(acc[51]), "+r"(acc[52]), "+r"(acc[53]), + "+r"(acc[54]), "+r"(acc[55]), "+r"(acc[56]), "+r"(acc[57]), "+r"(acc[58]), "+r"(acc[59]), + "+r"(acc[60]), "+r"(acc[61]), "+r"(acc[62]), "+r"(acc[63]) + : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "l"(desc_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x160x32 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Qgmma_rfa_e4m3_e5m2_fp32<160> { + static inline __device__ void mma(uint32_t const (&a)[4], uint64_t desc_b, uint32_t (&acc)[80]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n160k32.f32.e4m3.e5m2\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7,\n" + " %8, %9, %10, %11, %12, %13, 
%14, %15,\n" + " %16, %17, %18, %19, %20, %21, %22, %23,\n" + " %24, %25, %26, %27, %28, %29, %30, %31,\n" + " %32, %33, %34, %35, %36, %37, %38, %39,\n" + " %40, %41, %42, %43, %44, %45, %46, %47,\n" + " %48, %49, %50, %51, %52, %53, %54, %55,\n" + " %56, %57, %58, %59, %60, %61, %62, %63,\n" + " %64, %65, %66, %67, %68, %69, %70, %71,\n" + " %72, %73, %74, %75, %76, %77, %78, %79 \n" + "},\n" + "{ %80, %81, %82, %83 }, %84, 1, 1, 1;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]), "+r"(acc[8]), "+r"(acc[9]), "+r"(acc[10]), "+r"(acc[11]), + "+r"(acc[12]), "+r"(acc[13]), "+r"(acc[14]), "+r"(acc[15]), "+r"(acc[16]), "+r"(acc[17]), + "+r"(acc[18]), "+r"(acc[19]), "+r"(acc[20]), "+r"(acc[21]), "+r"(acc[22]), "+r"(acc[23]), + "+r"(acc[24]), "+r"(acc[25]), "+r"(acc[26]), "+r"(acc[27]), "+r"(acc[28]), "+r"(acc[29]), + "+r"(acc[30]), "+r"(acc[31]), "+r"(acc[32]), "+r"(acc[33]), "+r"(acc[34]), "+r"(acc[35]), + "+r"(acc[36]), "+r"(acc[37]), "+r"(acc[38]), "+r"(acc[39]), "+r"(acc[40]), "+r"(acc[41]), + "+r"(acc[42]), "+r"(acc[43]), "+r"(acc[44]), "+r"(acc[45]), "+r"(acc[46]), "+r"(acc[47]), + "+r"(acc[48]), "+r"(acc[49]), "+r"(acc[50]), "+r"(acc[51]), "+r"(acc[52]), "+r"(acc[53]), + "+r"(acc[54]), "+r"(acc[55]), "+r"(acc[56]), "+r"(acc[57]), "+r"(acc[58]), "+r"(acc[59]), + "+r"(acc[60]), "+r"(acc[61]), "+r"(acc[62]), "+r"(acc[63]), "+r"(acc[64]), "+r"(acc[65]), + "+r"(acc[66]), "+r"(acc[67]), "+r"(acc[68]), "+r"(acc[69]), "+r"(acc[70]), "+r"(acc[71]), + "+r"(acc[72]), "+r"(acc[73]), "+r"(acc[74]), "+r"(acc[75]), "+r"(acc[76]), "+r"(acc[77]), + "+r"(acc[78]), "+r"(acc[79]) + : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "l"(desc_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x192x32 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Qgmma_rfa_e4m3_e5m2_fp32<192> { + static inline __device__ void mma(uint32_t const (&a)[4], uint64_t desc_b, uint32_t (&acc)[96]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n192k32.f32.e4m3.e5m2\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7,\n" + " %8, %9, %10, %11, %12, %13, %14, %15,\n" + " %16, %17, %18, %19, %20, %21, %22, %23,\n" + " %24, %25, %26, %27, %28, %29, %30, %31,\n" + " %32, %33, %34, %35, %36, %37, %38, %39,\n" + " %40, %41, %42, %43, %44, %45, %46, %47,\n" + " %48, %49, %50, %51, %52, %53, %54, %55,\n" + " %56, %57, %58, %59, %60, %61, %62, %63,\n" + " %64, %65, %66, %67, %68, %69, %70, %71,\n" + " %72, %73, %74, %75, %76, %77, %78, %79,\n" + " %80, %81, %82, %83, %84, %85, %86, %87,\n" + " %88, %89, %90, %91, %92, %93, %94, %95 \n" + "},\n" + "{ %96, %97, %98, %99 }, %100, 1, 1, 1;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]), "+r"(acc[8]), "+r"(acc[9]), "+r"(acc[10]), "+r"(acc[11]), + "+r"(acc[12]), "+r"(acc[13]), "+r"(acc[14]), "+r"(acc[15]), "+r"(acc[16]), "+r"(acc[17]), + "+r"(acc[18]), "+r"(acc[19]), "+r"(acc[20]), "+r"(acc[21]), "+r"(acc[22]), "+r"(acc[23]), + "+r"(acc[24]), "+r"(acc[25]), "+r"(acc[26]), "+r"(acc[27]), "+r"(acc[28]), "+r"(acc[29]), + "+r"(acc[30]), "+r"(acc[31]), "+r"(acc[32]), "+r"(acc[33]), "+r"(acc[34]), "+r"(acc[35]), + "+r"(acc[36]), "+r"(acc[37]), "+r"(acc[38]), "+r"(acc[39]), "+r"(acc[40]), "+r"(acc[41]), + "+r"(acc[42]), 
"+r"(acc[43]), "+r"(acc[44]), "+r"(acc[45]), "+r"(acc[46]), "+r"(acc[47]), + "+r"(acc[48]), "+r"(acc[49]), "+r"(acc[50]), "+r"(acc[51]), "+r"(acc[52]), "+r"(acc[53]), + "+r"(acc[54]), "+r"(acc[55]), "+r"(acc[56]), "+r"(acc[57]), "+r"(acc[58]), "+r"(acc[59]), + "+r"(acc[60]), "+r"(acc[61]), "+r"(acc[62]), "+r"(acc[63]), "+r"(acc[64]), "+r"(acc[65]), + "+r"(acc[66]), "+r"(acc[67]), "+r"(acc[68]), "+r"(acc[69]), "+r"(acc[70]), "+r"(acc[71]), + "+r"(acc[72]), "+r"(acc[73]), "+r"(acc[74]), "+r"(acc[75]), "+r"(acc[76]), "+r"(acc[77]), + "+r"(acc[78]), "+r"(acc[79]), "+r"(acc[80]), "+r"(acc[81]), "+r"(acc[82]), "+r"(acc[83]), + "+r"(acc[84]), "+r"(acc[85]), "+r"(acc[86]), "+r"(acc[87]), "+r"(acc[88]), "+r"(acc[89]), + "+r"(acc[90]), "+r"(acc[91]), "+r"(acc[92]), "+r"(acc[93]), "+r"(acc[94]), "+r"(acc[95]) + : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "l"(desc_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x256x32 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Qgmma_rfa_e4m3_e5m2_fp32<256> { + static inline __device__ void mma(uint32_t const (&a)[4], uint64_t desc_b, uint32_t (&acc)[128]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n256k32.f32.e4m3.e5m2\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7,\n" + " %8, %9, %10, %11, %12, %13, %14, %15,\n" + " %16, %17, %18, %19, %20, %21, %22, %23,\n" + " %24, %25, %26, %27, %28, %29, %30, %31,\n" + " %32, %33, %34, %35, %36, %37, %38, %39,\n" + " %40, %41, %42, %43, %44, %45, %46, %47,\n" + " %48, %49, %50, %51, %52, %53, %54, %55,\n" + " %56, %57, %58, %59, %60, %61, %62, %63,\n" + " %64, %65, %66, %67, %68, %69, %70, %71,\n" + " %72, %73, %74, %75, %76, %77, %78, %79,\n" + " %80, %81, %82, %83, %84, %85, %86, %87,\n" + " %88, %89, %90, %91, %92, %93, %94, %95,\n" + " %96, %97, %98, %99, %100, %101, %102, %103,\n" + " %104, %105, %106, %107, %108, %109, %110, %111,\n" + " %112, %113, %114, %115, %116, %117, %118, %119,\n" + " %120, %121, %122, %123, %124, %125, %126, %127 \n" + "},\n" + "{ %128, %129, %130, %131 }, %132, 1, 1, 1;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]), "+r"(acc[8]), "+r"(acc[9]), "+r"(acc[10]), "+r"(acc[11]), + "+r"(acc[12]), "+r"(acc[13]), "+r"(acc[14]), "+r"(acc[15]), "+r"(acc[16]), "+r"(acc[17]), + "+r"(acc[18]), "+r"(acc[19]), "+r"(acc[20]), "+r"(acc[21]), "+r"(acc[22]), "+r"(acc[23]), + "+r"(acc[24]), "+r"(acc[25]), "+r"(acc[26]), "+r"(acc[27]), "+r"(acc[28]), "+r"(acc[29]), + "+r"(acc[30]), "+r"(acc[31]), "+r"(acc[32]), "+r"(acc[33]), "+r"(acc[34]), "+r"(acc[35]), + "+r"(acc[36]), "+r"(acc[37]), "+r"(acc[38]), "+r"(acc[39]), "+r"(acc[40]), "+r"(acc[41]), + "+r"(acc[42]), "+r"(acc[43]), "+r"(acc[44]), "+r"(acc[45]), "+r"(acc[46]), "+r"(acc[47]), + "+r"(acc[48]), "+r"(acc[49]), "+r"(acc[50]), "+r"(acc[51]), "+r"(acc[52]), "+r"(acc[53]), + "+r"(acc[54]), "+r"(acc[55]), "+r"(acc[56]), "+r"(acc[57]), "+r"(acc[58]), "+r"(acc[59]), + "+r"(acc[60]), "+r"(acc[61]), "+r"(acc[62]), "+r"(acc[63]), "+r"(acc[64]), "+r"(acc[65]), + "+r"(acc[66]), "+r"(acc[67]), "+r"(acc[68]), "+r"(acc[69]), "+r"(acc[70]), "+r"(acc[71]), + "+r"(acc[72]), "+r"(acc[73]), "+r"(acc[74]), "+r"(acc[75]), "+r"(acc[76]), "+r"(acc[77]), + "+r"(acc[78]), "+r"(acc[79]), "+r"(acc[80]), "+r"(acc[81]), "+r"(acc[82]), "+r"(acc[83]), + 
"+r"(acc[84]), "+r"(acc[85]), "+r"(acc[86]), "+r"(acc[87]), "+r"(acc[88]), "+r"(acc[89]), + "+r"(acc[90]), "+r"(acc[91]), "+r"(acc[92]), "+r"(acc[93]), "+r"(acc[94]), "+r"(acc[95]), + "+r"(acc[96]), "+r"(acc[97]), "+r"(acc[98]), "+r"(acc[99]), "+r"(acc[100]), + "+r"(acc[101]), "+r"(acc[102]), "+r"(acc[103]), "+r"(acc[104]), "+r"(acc[105]), + "+r"(acc[106]), "+r"(acc[107]), "+r"(acc[108]), "+r"(acc[109]), "+r"(acc[110]), + "+r"(acc[111]), "+r"(acc[112]), "+r"(acc[113]), "+r"(acc[114]), "+r"(acc[115]), + "+r"(acc[116]), "+r"(acc[117]), "+r"(acc[118]), "+r"(acc[119]), "+r"(acc[120]), + "+r"(acc[121]), "+r"(acc[122]), "+r"(acc[123]), "+r"(acc[124]), "+r"(acc[125]), + "+r"(acc[126]), "+r"(acc[127]) + : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "l"(desc_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ void qgmma_rfa_e4m3_e5m2_fp32(uint32_t const (&a)[4], uint64_t desc_b, + uint32_t (&acc)[N / 2]) { + Qgmma_rfa_e4m3_e5m2_fp32::mma(a, desc_b, acc); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// QGMMA e5m2 x e4m3 - 64xNx32 TN with int32 Accumulator with A and B from SMEM +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Qgmma_e5m2_e4m3_fp32 {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x8x32 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Qgmma_e5m2_e4m3_fp32<8> { + static inline __device__ void mma(uint64_t desc_a, uint64_t desc_b, uint32_t (&acc)[4]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n8k32.f32.e5m2.e4m3\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7\n" + "},\n" + " %8, %9, 1, 1, 1;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]) + : "l"(desc_a), "l"(desc_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x32x32 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Qgmma_e5m2_e4m3_fp32<32> { + static inline __device__ void mma(uint64_t desc_a, uint64_t desc_b, uint32_t (&acc)[16]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n32k32.f32.e5m2.e4m3\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7,\n" + " %8, %9, %10, %11, %12, %13, %14, %15 \n" + "},\n" + " %16, %17, 1, 1, 1;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]), "+r"(acc[8]), "+r"(acc[9]), "+r"(acc[10]), "+r"(acc[11]), + "+r"(acc[12]), "+r"(acc[13]), "+r"(acc[14]), "+r"(acc[15]) + : "l"(desc_a), "l"(desc_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x64x32 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Qgmma_e5m2_e4m3_fp32<64> { + static inline __device__ void mma(uint64_t desc_a, uint64_t desc_b, uint32_t (&acc)[32]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && 
defined(__CUDA_ARCH_FEAT_SM90_ALL) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n64k32.f32.e5m2.e4m3\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7,\n" + " %8, %9, %10, %11, %12, %13, %14, %15,\n" + " %16, %17, %18, %19, %20, %21, %22, %23,\n" + " %24, %25, %26, %27, %28, %29, %30, %31 \n" + "},\n" + " %32, %33, 1, 1, 1;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]), "+r"(acc[8]), "+r"(acc[9]), "+r"(acc[10]), "+r"(acc[11]), + "+r"(acc[12]), "+r"(acc[13]), "+r"(acc[14]), "+r"(acc[15]), "+r"(acc[16]), "+r"(acc[17]), + "+r"(acc[18]), "+r"(acc[19]), "+r"(acc[20]), "+r"(acc[21]), "+r"(acc[22]), "+r"(acc[23]), + "+r"(acc[24]), "+r"(acc[25]), "+r"(acc[26]), "+r"(acc[27]), "+r"(acc[28]), "+r"(acc[29]), + "+r"(acc[30]), "+r"(acc[31]) + : "l"(desc_a), "l"(desc_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x128x32 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Qgmma_e5m2_e4m3_fp32<128> { + static inline __device__ void mma(uint64_t desc_a, uint64_t desc_b, uint32_t (&acc)[64]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n128k32.f32.e5m2.e4m3\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7,\n" + " %8, %9, %10, %11, %12, %13, %14, %15,\n" + " %16, %17, %18, %19, %20, %21, %22, %23,\n" + " %24, %25, %26, %27, %28, %29, %30, %31,\n" + " %32, %33, %34, %35, %36, %37, %38, %39,\n" + " %40, %41, %42, %43, %44, %45, %46, %47,\n" + " %48, %49, %50, %51, %52, %53, %54, %55,\n" + " %56, %57, %58, %59, %60, %61, %62, %63 \n" + "},\n" + " %64, %65, 1, 1, 1;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]), "+r"(acc[8]), "+r"(acc[9]), "+r"(acc[10]), "+r"(acc[11]), + "+r"(acc[12]), "+r"(acc[13]), "+r"(acc[14]), "+r"(acc[15]), "+r"(acc[16]), "+r"(acc[17]), + "+r"(acc[18]), "+r"(acc[19]), "+r"(acc[20]), "+r"(acc[21]), "+r"(acc[22]), "+r"(acc[23]), + "+r"(acc[24]), "+r"(acc[25]), "+r"(acc[26]), "+r"(acc[27]), "+r"(acc[28]), "+r"(acc[29]), + "+r"(acc[30]), "+r"(acc[31]), "+r"(acc[32]), "+r"(acc[33]), "+r"(acc[34]), "+r"(acc[35]), + "+r"(acc[36]), "+r"(acc[37]), "+r"(acc[38]), "+r"(acc[39]), "+r"(acc[40]), "+r"(acc[41]), + "+r"(acc[42]), "+r"(acc[43]), "+r"(acc[44]), "+r"(acc[45]), "+r"(acc[46]), "+r"(acc[47]), + "+r"(acc[48]), "+r"(acc[49]), "+r"(acc[50]), "+r"(acc[51]), "+r"(acc[52]), "+r"(acc[53]), + "+r"(acc[54]), "+r"(acc[55]), "+r"(acc[56]), "+r"(acc[57]), "+r"(acc[58]), "+r"(acc[59]), + "+r"(acc[60]), "+r"(acc[61]), "+r"(acc[62]), "+r"(acc[63]) + : "l"(desc_a), "l"(desc_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x160x32 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Qgmma_e5m2_e4m3_fp32<160> { + static inline __device__ void mma(uint64_t desc_a, uint64_t desc_b, uint32_t (&acc)[80]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n160k32.f32.e5m2.e4m3\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7,\n" + " %8, %9, %10, %11, %12, %13, %14, %15,\n" + " %16, %17, %18, %19, %20, %21, %22, %23,\n" + " %24, %25, %26, %27, %28, %29, %30, %31,\n" + " %32, %33, 
%34, %35, %36, %37, %38, %39,\n" + " %40, %41, %42, %43, %44, %45, %46, %47,\n" + " %48, %49, %50, %51, %52, %53, %54, %55,\n" + " %56, %57, %58, %59, %60, %61, %62, %63,\n" + " %64, %65, %66, %67, %68, %69, %70, %71,\n" + " %72, %73, %74, %75, %76, %77, %78, %79 \n" + "},\n" + " %80, %81, 1, 1, 1;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]), "+r"(acc[8]), "+r"(acc[9]), "+r"(acc[10]), "+r"(acc[11]), + "+r"(acc[12]), "+r"(acc[13]), "+r"(acc[14]), "+r"(acc[15]), "+r"(acc[16]), "+r"(acc[17]), + "+r"(acc[18]), "+r"(acc[19]), "+r"(acc[20]), "+r"(acc[21]), "+r"(acc[22]), "+r"(acc[23]), + "+r"(acc[24]), "+r"(acc[25]), "+r"(acc[26]), "+r"(acc[27]), "+r"(acc[28]), "+r"(acc[29]), + "+r"(acc[30]), "+r"(acc[31]), "+r"(acc[32]), "+r"(acc[33]), "+r"(acc[34]), "+r"(acc[35]), + "+r"(acc[36]), "+r"(acc[37]), "+r"(acc[38]), "+r"(acc[39]), "+r"(acc[40]), "+r"(acc[41]), + "+r"(acc[42]), "+r"(acc[43]), "+r"(acc[44]), "+r"(acc[45]), "+r"(acc[46]), "+r"(acc[47]), + "+r"(acc[48]), "+r"(acc[49]), "+r"(acc[50]), "+r"(acc[51]), "+r"(acc[52]), "+r"(acc[53]), + "+r"(acc[54]), "+r"(acc[55]), "+r"(acc[56]), "+r"(acc[57]), "+r"(acc[58]), "+r"(acc[59]), + "+r"(acc[60]), "+r"(acc[61]), "+r"(acc[62]), "+r"(acc[63]), "+r"(acc[64]), "+r"(acc[65]), + "+r"(acc[66]), "+r"(acc[67]), "+r"(acc[68]), "+r"(acc[69]), "+r"(acc[70]), "+r"(acc[71]), + "+r"(acc[72]), "+r"(acc[73]), "+r"(acc[74]), "+r"(acc[75]), "+r"(acc[76]), "+r"(acc[77]), + "+r"(acc[78]), "+r"(acc[79]) + : "l"(desc_a), "l"(desc_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x192x32 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Qgmma_e5m2_e4m3_fp32<192> { + static inline __device__ void mma(uint64_t desc_a, uint64_t desc_b, uint32_t (&acc)[96]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n192k32.f32.e5m2.e4m3\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7,\n" + " %8, %9, %10, %11, %12, %13, %14, %15,\n" + " %16, %17, %18, %19, %20, %21, %22, %23,\n" + " %24, %25, %26, %27, %28, %29, %30, %31,\n" + " %32, %33, %34, %35, %36, %37, %38, %39,\n" + " %40, %41, %42, %43, %44, %45, %46, %47,\n" + " %48, %49, %50, %51, %52, %53, %54, %55,\n" + " %56, %57, %58, %59, %60, %61, %62, %63,\n" + " %64, %65, %66, %67, %68, %69, %70, %71,\n" + " %72, %73, %74, %75, %76, %77, %78, %79,\n" + " %80, %81, %82, %83, %84, %85, %86, %87,\n" + " %88, %89, %90, %91, %92, %93, %94, %95 \n" + "},\n" + " %96, %97, 1, 1, 1;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]), "+r"(acc[8]), "+r"(acc[9]), "+r"(acc[10]), "+r"(acc[11]), + "+r"(acc[12]), "+r"(acc[13]), "+r"(acc[14]), "+r"(acc[15]), "+r"(acc[16]), "+r"(acc[17]), + "+r"(acc[18]), "+r"(acc[19]), "+r"(acc[20]), "+r"(acc[21]), "+r"(acc[22]), "+r"(acc[23]), + "+r"(acc[24]), "+r"(acc[25]), "+r"(acc[26]), "+r"(acc[27]), "+r"(acc[28]), "+r"(acc[29]), + "+r"(acc[30]), "+r"(acc[31]), "+r"(acc[32]), "+r"(acc[33]), "+r"(acc[34]), "+r"(acc[35]), + "+r"(acc[36]), "+r"(acc[37]), "+r"(acc[38]), "+r"(acc[39]), "+r"(acc[40]), "+r"(acc[41]), + "+r"(acc[42]), "+r"(acc[43]), "+r"(acc[44]), "+r"(acc[45]), "+r"(acc[46]), "+r"(acc[47]), + "+r"(acc[48]), "+r"(acc[49]), "+r"(acc[50]), "+r"(acc[51]), "+r"(acc[52]), "+r"(acc[53]), + "+r"(acc[54]), "+r"(acc[55]), 
"+r"(acc[56]), "+r"(acc[57]), "+r"(acc[58]), "+r"(acc[59]), + "+r"(acc[60]), "+r"(acc[61]), "+r"(acc[62]), "+r"(acc[63]), "+r"(acc[64]), "+r"(acc[65]), + "+r"(acc[66]), "+r"(acc[67]), "+r"(acc[68]), "+r"(acc[69]), "+r"(acc[70]), "+r"(acc[71]), + "+r"(acc[72]), "+r"(acc[73]), "+r"(acc[74]), "+r"(acc[75]), "+r"(acc[76]), "+r"(acc[77]), + "+r"(acc[78]), "+r"(acc[79]), "+r"(acc[80]), "+r"(acc[81]), "+r"(acc[82]), "+r"(acc[83]), + "+r"(acc[84]), "+r"(acc[85]), "+r"(acc[86]), "+r"(acc[87]), "+r"(acc[88]), "+r"(acc[89]), + "+r"(acc[90]), "+r"(acc[91]), "+r"(acc[92]), "+r"(acc[93]), "+r"(acc[94]), "+r"(acc[95]) + : "l"(desc_a), "l"(desc_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x256x32 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Qgmma_e5m2_e4m3_fp32<256> { + static inline __device__ void mma(uint64_t desc_a, uint64_t desc_b, uint32_t (&acc)[128]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n256k32.f32.e5m2.e4m3\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7,\n" + " %8, %9, %10, %11, %12, %13, %14, %15,\n" + " %16, %17, %18, %19, %20, %21, %22, %23,\n" + " %24, %25, %26, %27, %28, %29, %30, %31,\n" + " %32, %33, %34, %35, %36, %37, %38, %39,\n" + " %40, %41, %42, %43, %44, %45, %46, %47,\n" + " %48, %49, %50, %51, %52, %53, %54, %55,\n" + " %56, %57, %58, %59, %60, %61, %62, %63,\n" + " %64, %65, %66, %67, %68, %69, %70, %71,\n" + " %72, %73, %74, %75, %76, %77, %78, %79,\n" + " %80, %81, %82, %83, %84, %85, %86, %87,\n" + " %88, %89, %90, %91, %92, %93, %94, %95,\n" + " %96, %97, %98, %99, %100, %101, %102, %103,\n" + " %104, %105, %106, %107, %108, %109, %110, %111,\n" + " %112, %113, %114, %115, %116, %117, %118, %119,\n" + " %120, %121, %122, %123, %124, %125, %126, %127 \n" + "},\n" + " %128, %129, 1, 1, 1;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]), "+r"(acc[8]), "+r"(acc[9]), "+r"(acc[10]), "+r"(acc[11]), + "+r"(acc[12]), "+r"(acc[13]), "+r"(acc[14]), "+r"(acc[15]), "+r"(acc[16]), "+r"(acc[17]), + "+r"(acc[18]), "+r"(acc[19]), "+r"(acc[20]), "+r"(acc[21]), "+r"(acc[22]), "+r"(acc[23]), + "+r"(acc[24]), "+r"(acc[25]), "+r"(acc[26]), "+r"(acc[27]), "+r"(acc[28]), "+r"(acc[29]), + "+r"(acc[30]), "+r"(acc[31]), "+r"(acc[32]), "+r"(acc[33]), "+r"(acc[34]), "+r"(acc[35]), + "+r"(acc[36]), "+r"(acc[37]), "+r"(acc[38]), "+r"(acc[39]), "+r"(acc[40]), "+r"(acc[41]), + "+r"(acc[42]), "+r"(acc[43]), "+r"(acc[44]), "+r"(acc[45]), "+r"(acc[46]), "+r"(acc[47]), + "+r"(acc[48]), "+r"(acc[49]), "+r"(acc[50]), "+r"(acc[51]), "+r"(acc[52]), "+r"(acc[53]), + "+r"(acc[54]), "+r"(acc[55]), "+r"(acc[56]), "+r"(acc[57]), "+r"(acc[58]), "+r"(acc[59]), + "+r"(acc[60]), "+r"(acc[61]), "+r"(acc[62]), "+r"(acc[63]), "+r"(acc[64]), "+r"(acc[65]), + "+r"(acc[66]), "+r"(acc[67]), "+r"(acc[68]), "+r"(acc[69]), "+r"(acc[70]), "+r"(acc[71]), + "+r"(acc[72]), "+r"(acc[73]), "+r"(acc[74]), "+r"(acc[75]), "+r"(acc[76]), "+r"(acc[77]), + "+r"(acc[78]), "+r"(acc[79]), "+r"(acc[80]), "+r"(acc[81]), "+r"(acc[82]), "+r"(acc[83]), + "+r"(acc[84]), "+r"(acc[85]), "+r"(acc[86]), "+r"(acc[87]), "+r"(acc[88]), "+r"(acc[89]), + "+r"(acc[90]), "+r"(acc[91]), "+r"(acc[92]), "+r"(acc[93]), "+r"(acc[94]), "+r"(acc[95]), + "+r"(acc[96]), "+r"(acc[97]), "+r"(acc[98]), "+r"(acc[99]), "+r"(acc[100]), + 
"+r"(acc[101]), "+r"(acc[102]), "+r"(acc[103]), "+r"(acc[104]), "+r"(acc[105]), + "+r"(acc[106]), "+r"(acc[107]), "+r"(acc[108]), "+r"(acc[109]), "+r"(acc[110]), + "+r"(acc[111]), "+r"(acc[112]), "+r"(acc[113]), "+r"(acc[114]), "+r"(acc[115]), + "+r"(acc[116]), "+r"(acc[117]), "+r"(acc[118]), "+r"(acc[119]), "+r"(acc[120]), + "+r"(acc[121]), "+r"(acc[122]), "+r"(acc[123]), "+r"(acc[124]), "+r"(acc[125]), + "+r"(acc[126]), "+r"(acc[127]) + : "l"(desc_a), "l"(desc_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ void qgmma_e5m2_e4m3_fp32(uint64_t desc_a, uint64_t desc_b, + uint32_t (&acc)[N / 2]) { + Qgmma_e5m2_e4m3_fp32::mma(desc_a, desc_b, acc); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// QGMMA e5m2 x e4m3 - 64xNx32 TN with int32 Accumulator with A from RF and B from SMEM +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Qgmma_rfa_e5m2_e4m3_fp32 {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x8x32 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Qgmma_rfa_e5m2_e4m3_fp32<8> { + static inline __device__ void mma(uint32_t const (&a)[4], uint64_t desc_b, uint32_t (&acc)[4]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n8k32.f32.e5m2.e4m3\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7\n" + "},\n" + "{ %8, %9, %10, %11 }, %12, 1, 1, 1;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]) + : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "l"(desc_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x32x32 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Qgmma_rfa_e5m2_e4m3_fp32<32> { + static inline __device__ void mma(uint32_t const (&a)[4], uint64_t desc_b, uint32_t (&acc)[16]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n32k32.f32.e5m2.e4m3\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7,\n" + " %8, %9, %10, %11, %12, %13, %14, %15 \n" + "},\n" + "{ %16, %17, %18, %19 }, %20, 1, 1, 1;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]), "+r"(acc[8]), "+r"(acc[9]), "+r"(acc[10]), "+r"(acc[11]), + "+r"(acc[12]), "+r"(acc[13]), "+r"(acc[14]), "+r"(acc[15]) + : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "l"(desc_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x64x32 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Qgmma_rfa_e5m2_e4m3_fp32<64> { + static inline __device__ void mma(uint32_t const (&a)[4], uint64_t desc_b, uint32_t (&acc)[32]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n64k32.f32.e5m2.e4m3\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7,\n" + " %8, %9, %10, %11, 
%12, %13, %14, %15,\n" + " %16, %17, %18, %19, %20, %21, %22, %23,\n" + " %24, %25, %26, %27, %28, %29, %30, %31 \n" + "},\n" + "{ %32, %33, %34, %35 }, %36, 1, 1, 1;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]), "+r"(acc[8]), "+r"(acc[9]), "+r"(acc[10]), "+r"(acc[11]), + "+r"(acc[12]), "+r"(acc[13]), "+r"(acc[14]), "+r"(acc[15]), "+r"(acc[16]), "+r"(acc[17]), + "+r"(acc[18]), "+r"(acc[19]), "+r"(acc[20]), "+r"(acc[21]), "+r"(acc[22]), "+r"(acc[23]), + "+r"(acc[24]), "+r"(acc[25]), "+r"(acc[26]), "+r"(acc[27]), "+r"(acc[28]), "+r"(acc[29]), + "+r"(acc[30]), "+r"(acc[31]) + : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "l"(desc_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x128x32 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Qgmma_rfa_e5m2_e4m3_fp32<128> { + static inline __device__ void mma(uint32_t const (&a)[4], uint64_t desc_b, uint32_t (&acc)[64]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n128k32.f32.e5m2.e4m3\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7,\n" + " %8, %9, %10, %11, %12, %13, %14, %15,\n" + " %16, %17, %18, %19, %20, %21, %22, %23,\n" + " %24, %25, %26, %27, %28, %29, %30, %31,\n" + " %32, %33, %34, %35, %36, %37, %38, %39,\n" + " %40, %41, %42, %43, %44, %45, %46, %47,\n" + " %48, %49, %50, %51, %52, %53, %54, %55,\n" + " %56, %57, %58, %59, %60, %61, %62, %63 \n" + "},\n" + "{ %64, %65, %66, %67 }, %68, 1, 1, 1;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]), "+r"(acc[8]), "+r"(acc[9]), "+r"(acc[10]), "+r"(acc[11]), + "+r"(acc[12]), "+r"(acc[13]), "+r"(acc[14]), "+r"(acc[15]), "+r"(acc[16]), "+r"(acc[17]), + "+r"(acc[18]), "+r"(acc[19]), "+r"(acc[20]), "+r"(acc[21]), "+r"(acc[22]), "+r"(acc[23]), + "+r"(acc[24]), "+r"(acc[25]), "+r"(acc[26]), "+r"(acc[27]), "+r"(acc[28]), "+r"(acc[29]), + "+r"(acc[30]), "+r"(acc[31]), "+r"(acc[32]), "+r"(acc[33]), "+r"(acc[34]), "+r"(acc[35]), + "+r"(acc[36]), "+r"(acc[37]), "+r"(acc[38]), "+r"(acc[39]), "+r"(acc[40]), "+r"(acc[41]), + "+r"(acc[42]), "+r"(acc[43]), "+r"(acc[44]), "+r"(acc[45]), "+r"(acc[46]), "+r"(acc[47]), + "+r"(acc[48]), "+r"(acc[49]), "+r"(acc[50]), "+r"(acc[51]), "+r"(acc[52]), "+r"(acc[53]), + "+r"(acc[54]), "+r"(acc[55]), "+r"(acc[56]), "+r"(acc[57]), "+r"(acc[58]), "+r"(acc[59]), + "+r"(acc[60]), "+r"(acc[61]), "+r"(acc[62]), "+r"(acc[63]) + : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "l"(desc_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x160x32 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Qgmma_rfa_e5m2_e4m3_fp32<160> { + static inline __device__ void mma(uint32_t const (&a)[4], uint64_t desc_b, uint32_t (&acc)[80]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n160k32.f32.e5m2.e4m3\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7,\n" + " %8, %9, %10, %11, %12, %13, %14, %15,\n" + " %16, %17, %18, %19, %20, %21, %22, %23,\n" + " %24, %25, %26, %27, %28, %29, %30, %31,\n" + " %32, %33, %34, %35, %36, %37, %38, %39,\n" + " %40, %41, %42, %43, 
%44, %45, %46, %47,\n" + " %48, %49, %50, %51, %52, %53, %54, %55,\n" + " %56, %57, %58, %59, %60, %61, %62, %63,\n" + " %64, %65, %66, %67, %68, %69, %70, %71,\n" + " %72, %73, %74, %75, %76, %77, %78, %79 \n" + "},\n" + "{ %80, %81, %82, %83 }, %84, 1, 1, 1;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]), "+r"(acc[8]), "+r"(acc[9]), "+r"(acc[10]), "+r"(acc[11]), + "+r"(acc[12]), "+r"(acc[13]), "+r"(acc[14]), "+r"(acc[15]), "+r"(acc[16]), "+r"(acc[17]), + "+r"(acc[18]), "+r"(acc[19]), "+r"(acc[20]), "+r"(acc[21]), "+r"(acc[22]), "+r"(acc[23]), + "+r"(acc[24]), "+r"(acc[25]), "+r"(acc[26]), "+r"(acc[27]), "+r"(acc[28]), "+r"(acc[29]), + "+r"(acc[30]), "+r"(acc[31]), "+r"(acc[32]), "+r"(acc[33]), "+r"(acc[34]), "+r"(acc[35]), + "+r"(acc[36]), "+r"(acc[37]), "+r"(acc[38]), "+r"(acc[39]), "+r"(acc[40]), "+r"(acc[41]), + "+r"(acc[42]), "+r"(acc[43]), "+r"(acc[44]), "+r"(acc[45]), "+r"(acc[46]), "+r"(acc[47]), + "+r"(acc[48]), "+r"(acc[49]), "+r"(acc[50]), "+r"(acc[51]), "+r"(acc[52]), "+r"(acc[53]), + "+r"(acc[54]), "+r"(acc[55]), "+r"(acc[56]), "+r"(acc[57]), "+r"(acc[58]), "+r"(acc[59]), + "+r"(acc[60]), "+r"(acc[61]), "+r"(acc[62]), "+r"(acc[63]), "+r"(acc[64]), "+r"(acc[65]), + "+r"(acc[66]), "+r"(acc[67]), "+r"(acc[68]), "+r"(acc[69]), "+r"(acc[70]), "+r"(acc[71]), + "+r"(acc[72]), "+r"(acc[73]), "+r"(acc[74]), "+r"(acc[75]), "+r"(acc[76]), "+r"(acc[77]), + "+r"(acc[78]), "+r"(acc[79]) + : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "l"(desc_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x192x32 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Qgmma_rfa_e5m2_e4m3_fp32<192> { + static inline __device__ void mma(uint32_t const (&a)[4], uint64_t desc_b, uint32_t (&acc)[96]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n192k32.f32.e5m2.e4m3\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7,\n" + " %8, %9, %10, %11, %12, %13, %14, %15,\n" + " %16, %17, %18, %19, %20, %21, %22, %23,\n" + " %24, %25, %26, %27, %28, %29, %30, %31,\n" + " %32, %33, %34, %35, %36, %37, %38, %39,\n" + " %40, %41, %42, %43, %44, %45, %46, %47,\n" + " %48, %49, %50, %51, %52, %53, %54, %55,\n" + " %56, %57, %58, %59, %60, %61, %62, %63,\n" + " %64, %65, %66, %67, %68, %69, %70, %71,\n" + " %72, %73, %74, %75, %76, %77, %78, %79,\n" + " %80, %81, %82, %83, %84, %85, %86, %87,\n" + " %88, %89, %90, %91, %92, %93, %94, %95 \n" + "},\n" + "{ %96, %97, %98, %99 }, %100, 1, 1, 1;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]), "+r"(acc[8]), "+r"(acc[9]), "+r"(acc[10]), "+r"(acc[11]), + "+r"(acc[12]), "+r"(acc[13]), "+r"(acc[14]), "+r"(acc[15]), "+r"(acc[16]), "+r"(acc[17]), + "+r"(acc[18]), "+r"(acc[19]), "+r"(acc[20]), "+r"(acc[21]), "+r"(acc[22]), "+r"(acc[23]), + "+r"(acc[24]), "+r"(acc[25]), "+r"(acc[26]), "+r"(acc[27]), "+r"(acc[28]), "+r"(acc[29]), + "+r"(acc[30]), "+r"(acc[31]), "+r"(acc[32]), "+r"(acc[33]), "+r"(acc[34]), "+r"(acc[35]), + "+r"(acc[36]), "+r"(acc[37]), "+r"(acc[38]), "+r"(acc[39]), "+r"(acc[40]), "+r"(acc[41]), + "+r"(acc[42]), "+r"(acc[43]), "+r"(acc[44]), "+r"(acc[45]), "+r"(acc[46]), "+r"(acc[47]), + "+r"(acc[48]), "+r"(acc[49]), "+r"(acc[50]), "+r"(acc[51]), "+r"(acc[52]), "+r"(acc[53]), + 
"+r"(acc[54]), "+r"(acc[55]), "+r"(acc[56]), "+r"(acc[57]), "+r"(acc[58]), "+r"(acc[59]), + "+r"(acc[60]), "+r"(acc[61]), "+r"(acc[62]), "+r"(acc[63]), "+r"(acc[64]), "+r"(acc[65]), + "+r"(acc[66]), "+r"(acc[67]), "+r"(acc[68]), "+r"(acc[69]), "+r"(acc[70]), "+r"(acc[71]), + "+r"(acc[72]), "+r"(acc[73]), "+r"(acc[74]), "+r"(acc[75]), "+r"(acc[76]), "+r"(acc[77]), + "+r"(acc[78]), "+r"(acc[79]), "+r"(acc[80]), "+r"(acc[81]), "+r"(acc[82]), "+r"(acc[83]), + "+r"(acc[84]), "+r"(acc[85]), "+r"(acc[86]), "+r"(acc[87]), "+r"(acc[88]), "+r"(acc[89]), + "+r"(acc[90]), "+r"(acc[91]), "+r"(acc[92]), "+r"(acc[93]), "+r"(acc[94]), "+r"(acc[95]) + : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "l"(desc_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x256x32 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Qgmma_rfa_e5m2_e4m3_fp32<256> { + static inline __device__ void mma(uint32_t const (&a)[4], uint64_t desc_b, uint32_t (&acc)[128]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n256k32.f32.e5m2.e4m3\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7,\n" + " %8, %9, %10, %11, %12, %13, %14, %15,\n" + " %16, %17, %18, %19, %20, %21, %22, %23,\n" + " %24, %25, %26, %27, %28, %29, %30, %31,\n" + " %32, %33, %34, %35, %36, %37, %38, %39,\n" + " %40, %41, %42, %43, %44, %45, %46, %47,\n" + " %48, %49, %50, %51, %52, %53, %54, %55,\n" + " %56, %57, %58, %59, %60, %61, %62, %63,\n" + " %64, %65, %66, %67, %68, %69, %70, %71,\n" + " %72, %73, %74, %75, %76, %77, %78, %79,\n" + " %80, %81, %82, %83, %84, %85, %86, %87,\n" + " %88, %89, %90, %91, %92, %93, %94, %95,\n" + " %96, %97, %98, %99, %100, %101, %102, %103,\n" + " %104, %105, %106, %107, %108, %109, %110, %111,\n" + " %112, %113, %114, %115, %116, %117, %118, %119,\n" + " %120, %121, %122, %123, %124, %125, %126, %127 \n" + "},\n" + "{ %128, %129, %130, %131 }, %132, 1, 1, 1;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]), "+r"(acc[8]), "+r"(acc[9]), "+r"(acc[10]), "+r"(acc[11]), + "+r"(acc[12]), "+r"(acc[13]), "+r"(acc[14]), "+r"(acc[15]), "+r"(acc[16]), "+r"(acc[17]), + "+r"(acc[18]), "+r"(acc[19]), "+r"(acc[20]), "+r"(acc[21]), "+r"(acc[22]), "+r"(acc[23]), + "+r"(acc[24]), "+r"(acc[25]), "+r"(acc[26]), "+r"(acc[27]), "+r"(acc[28]), "+r"(acc[29]), + "+r"(acc[30]), "+r"(acc[31]), "+r"(acc[32]), "+r"(acc[33]), "+r"(acc[34]), "+r"(acc[35]), + "+r"(acc[36]), "+r"(acc[37]), "+r"(acc[38]), "+r"(acc[39]), "+r"(acc[40]), "+r"(acc[41]), + "+r"(acc[42]), "+r"(acc[43]), "+r"(acc[44]), "+r"(acc[45]), "+r"(acc[46]), "+r"(acc[47]), + "+r"(acc[48]), "+r"(acc[49]), "+r"(acc[50]), "+r"(acc[51]), "+r"(acc[52]), "+r"(acc[53]), + "+r"(acc[54]), "+r"(acc[55]), "+r"(acc[56]), "+r"(acc[57]), "+r"(acc[58]), "+r"(acc[59]), + "+r"(acc[60]), "+r"(acc[61]), "+r"(acc[62]), "+r"(acc[63]), "+r"(acc[64]), "+r"(acc[65]), + "+r"(acc[66]), "+r"(acc[67]), "+r"(acc[68]), "+r"(acc[69]), "+r"(acc[70]), "+r"(acc[71]), + "+r"(acc[72]), "+r"(acc[73]), "+r"(acc[74]), "+r"(acc[75]), "+r"(acc[76]), "+r"(acc[77]), + "+r"(acc[78]), "+r"(acc[79]), "+r"(acc[80]), "+r"(acc[81]), "+r"(acc[82]), "+r"(acc[83]), + "+r"(acc[84]), "+r"(acc[85]), "+r"(acc[86]), "+r"(acc[87]), "+r"(acc[88]), "+r"(acc[89]), + "+r"(acc[90]), "+r"(acc[91]), "+r"(acc[92]), "+r"(acc[93]), "+r"(acc[94]), 
"+r"(acc[95]), + "+r"(acc[96]), "+r"(acc[97]), "+r"(acc[98]), "+r"(acc[99]), "+r"(acc[100]), + "+r"(acc[101]), "+r"(acc[102]), "+r"(acc[103]), "+r"(acc[104]), "+r"(acc[105]), + "+r"(acc[106]), "+r"(acc[107]), "+r"(acc[108]), "+r"(acc[109]), "+r"(acc[110]), + "+r"(acc[111]), "+r"(acc[112]), "+r"(acc[113]), "+r"(acc[114]), "+r"(acc[115]), + "+r"(acc[116]), "+r"(acc[117]), "+r"(acc[118]), "+r"(acc[119]), "+r"(acc[120]), + "+r"(acc[121]), "+r"(acc[122]), "+r"(acc[123]), "+r"(acc[124]), "+r"(acc[125]), + "+r"(acc[126]), "+r"(acc[127]) + : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "l"(desc_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ void qgmma_rfa_e5m2_e4m3_fp32(uint32_t const (&a)[4], uint64_t desc_b, + uint32_t (&acc)[N / 2]) { + Qgmma_rfa_e5m2_e4m3_fp32::mma(a, desc_b, acc); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// QGMMA e5m2 x e5m2 - 64xNx32 TN with int32 Accumulator with A and B from SMEM +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Qgmma_e5m2_e5m2_fp32 {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x8x32 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Qgmma_e5m2_e5m2_fp32<8> { + static inline __device__ void mma(uint64_t desc_a, uint64_t desc_b, uint32_t (&acc)[4]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n8k32.f32.e5m2.e5m2\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7\n" + "},\n" + " %8, %9, 1, 1, 1;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]) + : "l"(desc_a), "l"(desc_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x64x32 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Qgmma_e5m2_e5m2_fp32<64> { + static inline __device__ void mma(uint64_t desc_a, uint64_t desc_b, uint32_t (&acc)[32]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n64k32.f32.e5m2.e5m2\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7,\n" + " %8, %9, %10, %11, %12, %13, %14, %15,\n" + " %16, %17, %18, %19, %20, %21, %22, %23,\n" + " %24, %25, %26, %27, %28, %29, %30, %31 \n" + "},\n" + " %32, %33, 1, 1, 1;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]), "+r"(acc[8]), "+r"(acc[9]), "+r"(acc[10]), "+r"(acc[11]), + "+r"(acc[12]), "+r"(acc[13]), "+r"(acc[14]), "+r"(acc[15]), "+r"(acc[16]), "+r"(acc[17]), + "+r"(acc[18]), "+r"(acc[19]), "+r"(acc[20]), "+r"(acc[21]), "+r"(acc[22]), "+r"(acc[23]), + "+r"(acc[24]), "+r"(acc[25]), "+r"(acc[26]), "+r"(acc[27]), "+r"(acc[28]), "+r"(acc[29]), + "+r"(acc[30]), "+r"(acc[31]) + : "l"(desc_a), "l"(desc_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x128x32 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct 
Qgmma_e5m2_e5m2_fp32<128> { + static inline __device__ void mma(uint64_t desc_a, uint64_t desc_b, uint32_t (&acc)[64]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n128k32.f32.e5m2.e5m2\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7,\n" + " %8, %9, %10, %11, %12, %13, %14, %15,\n" + " %16, %17, %18, %19, %20, %21, %22, %23,\n" + " %24, %25, %26, %27, %28, %29, %30, %31,\n" + " %32, %33, %34, %35, %36, %37, %38, %39,\n" + " %40, %41, %42, %43, %44, %45, %46, %47,\n" + " %48, %49, %50, %51, %52, %53, %54, %55,\n" + " %56, %57, %58, %59, %60, %61, %62, %63 \n" + "},\n" + " %64, %65, 1, 1, 1;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]), "+r"(acc[8]), "+r"(acc[9]), "+r"(acc[10]), "+r"(acc[11]), + "+r"(acc[12]), "+r"(acc[13]), "+r"(acc[14]), "+r"(acc[15]), "+r"(acc[16]), "+r"(acc[17]), + "+r"(acc[18]), "+r"(acc[19]), "+r"(acc[20]), "+r"(acc[21]), "+r"(acc[22]), "+r"(acc[23]), + "+r"(acc[24]), "+r"(acc[25]), "+r"(acc[26]), "+r"(acc[27]), "+r"(acc[28]), "+r"(acc[29]), + "+r"(acc[30]), "+r"(acc[31]), "+r"(acc[32]), "+r"(acc[33]), "+r"(acc[34]), "+r"(acc[35]), + "+r"(acc[36]), "+r"(acc[37]), "+r"(acc[38]), "+r"(acc[39]), "+r"(acc[40]), "+r"(acc[41]), + "+r"(acc[42]), "+r"(acc[43]), "+r"(acc[44]), "+r"(acc[45]), "+r"(acc[46]), "+r"(acc[47]), + "+r"(acc[48]), "+r"(acc[49]), "+r"(acc[50]), "+r"(acc[51]), "+r"(acc[52]), "+r"(acc[53]), + "+r"(acc[54]), "+r"(acc[55]), "+r"(acc[56]), "+r"(acc[57]), "+r"(acc[58]), "+r"(acc[59]), + "+r"(acc[60]), "+r"(acc[61]), "+r"(acc[62]), "+r"(acc[63]) + : "l"(desc_a), "l"(desc_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x160x32 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Qgmma_e5m2_e5m2_fp32<160> { + static inline __device__ void mma(uint64_t desc_a, uint64_t desc_b, uint32_t (&acc)[80]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n160k32.f32.e5m2.e5m2\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7,\n" + " %8, %9, %10, %11, %12, %13, %14, %15,\n" + " %16, %17, %18, %19, %20, %21, %22, %23,\n" + " %24, %25, %26, %27, %28, %29, %30, %31,\n" + " %32, %33, %34, %35, %36, %37, %38, %39,\n" + " %40, %41, %42, %43, %44, %45, %46, %47,\n" + " %48, %49, %50, %51, %52, %53, %54, %55,\n" + " %56, %57, %58, %59, %60, %61, %62, %63,\n" + " %64, %65, %66, %67, %68, %69, %70, %71,\n" + " %72, %73, %74, %75, %76, %77, %78, %79 \n" + "},\n" + " %80, %81, 1, 1, 1;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]), "+r"(acc[8]), "+r"(acc[9]), "+r"(acc[10]), "+r"(acc[11]), + "+r"(acc[12]), "+r"(acc[13]), "+r"(acc[14]), "+r"(acc[15]), "+r"(acc[16]), "+r"(acc[17]), + "+r"(acc[18]), "+r"(acc[19]), "+r"(acc[20]), "+r"(acc[21]), "+r"(acc[22]), "+r"(acc[23]), + "+r"(acc[24]), "+r"(acc[25]), "+r"(acc[26]), "+r"(acc[27]), "+r"(acc[28]), "+r"(acc[29]), + "+r"(acc[30]), "+r"(acc[31]), "+r"(acc[32]), "+r"(acc[33]), "+r"(acc[34]), "+r"(acc[35]), + "+r"(acc[36]), "+r"(acc[37]), "+r"(acc[38]), "+r"(acc[39]), "+r"(acc[40]), "+r"(acc[41]), + "+r"(acc[42]), "+r"(acc[43]), "+r"(acc[44]), "+r"(acc[45]), "+r"(acc[46]), "+r"(acc[47]), + "+r"(acc[48]), "+r"(acc[49]), "+r"(acc[50]), "+r"(acc[51]), 
"+r"(acc[52]), "+r"(acc[53]), + "+r"(acc[54]), "+r"(acc[55]), "+r"(acc[56]), "+r"(acc[57]), "+r"(acc[58]), "+r"(acc[59]), + "+r"(acc[60]), "+r"(acc[61]), "+r"(acc[62]), "+r"(acc[63]), "+r"(acc[64]), "+r"(acc[65]), + "+r"(acc[66]), "+r"(acc[67]), "+r"(acc[68]), "+r"(acc[69]), "+r"(acc[70]), "+r"(acc[71]), + "+r"(acc[72]), "+r"(acc[73]), "+r"(acc[74]), "+r"(acc[75]), "+r"(acc[76]), "+r"(acc[77]), + "+r"(acc[78]), "+r"(acc[79]) + : "l"(desc_a), "l"(desc_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x192x32 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Qgmma_e5m2_e5m2_fp32<192> { + static inline __device__ void mma(uint64_t desc_a, uint64_t desc_b, uint32_t (&acc)[96]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n192k32.f32.e5m2.e5m2\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7,\n" + " %8, %9, %10, %11, %12, %13, %14, %15,\n" + " %16, %17, %18, %19, %20, %21, %22, %23,\n" + " %24, %25, %26, %27, %28, %29, %30, %31,\n" + " %32, %33, %34, %35, %36, %37, %38, %39,\n" + " %40, %41, %42, %43, %44, %45, %46, %47,\n" + " %48, %49, %50, %51, %52, %53, %54, %55,\n" + " %56, %57, %58, %59, %60, %61, %62, %63,\n" + " %64, %65, %66, %67, %68, %69, %70, %71,\n" + " %72, %73, %74, %75, %76, %77, %78, %79,\n" + " %80, %81, %82, %83, %84, %85, %86, %87,\n" + " %88, %89, %90, %91, %92, %93, %94, %95 \n" + "},\n" + " %96, %97, 1, 1, 1;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]), "+r"(acc[8]), "+r"(acc[9]), "+r"(acc[10]), "+r"(acc[11]), + "+r"(acc[12]), "+r"(acc[13]), "+r"(acc[14]), "+r"(acc[15]), "+r"(acc[16]), "+r"(acc[17]), + "+r"(acc[18]), "+r"(acc[19]), "+r"(acc[20]), "+r"(acc[21]), "+r"(acc[22]), "+r"(acc[23]), + "+r"(acc[24]), "+r"(acc[25]), "+r"(acc[26]), "+r"(acc[27]), "+r"(acc[28]), "+r"(acc[29]), + "+r"(acc[30]), "+r"(acc[31]), "+r"(acc[32]), "+r"(acc[33]), "+r"(acc[34]), "+r"(acc[35]), + "+r"(acc[36]), "+r"(acc[37]), "+r"(acc[38]), "+r"(acc[39]), "+r"(acc[40]), "+r"(acc[41]), + "+r"(acc[42]), "+r"(acc[43]), "+r"(acc[44]), "+r"(acc[45]), "+r"(acc[46]), "+r"(acc[47]), + "+r"(acc[48]), "+r"(acc[49]), "+r"(acc[50]), "+r"(acc[51]), "+r"(acc[52]), "+r"(acc[53]), + "+r"(acc[54]), "+r"(acc[55]), "+r"(acc[56]), "+r"(acc[57]), "+r"(acc[58]), "+r"(acc[59]), + "+r"(acc[60]), "+r"(acc[61]), "+r"(acc[62]), "+r"(acc[63]), "+r"(acc[64]), "+r"(acc[65]), + "+r"(acc[66]), "+r"(acc[67]), "+r"(acc[68]), "+r"(acc[69]), "+r"(acc[70]), "+r"(acc[71]), + "+r"(acc[72]), "+r"(acc[73]), "+r"(acc[74]), "+r"(acc[75]), "+r"(acc[76]), "+r"(acc[77]), + "+r"(acc[78]), "+r"(acc[79]), "+r"(acc[80]), "+r"(acc[81]), "+r"(acc[82]), "+r"(acc[83]), + "+r"(acc[84]), "+r"(acc[85]), "+r"(acc[86]), "+r"(acc[87]), "+r"(acc[88]), "+r"(acc[89]), + "+r"(acc[90]), "+r"(acc[91]), "+r"(acc[92]), "+r"(acc[93]), "+r"(acc[94]), "+r"(acc[95]) + : "l"(desc_a), "l"(desc_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x256x32 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Qgmma_e5m2_e5m2_fp32<256> { + static inline __device__ void mma(uint64_t desc_a, uint64_t desc_b, uint32_t (&acc)[128]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && 
defined(__CUDA_ARCH_FEAT_SM90_ALL) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n256k32.f32.e5m2.e5m2\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7,\n" + " %8, %9, %10, %11, %12, %13, %14, %15,\n" + " %16, %17, %18, %19, %20, %21, %22, %23,\n" + " %24, %25, %26, %27, %28, %29, %30, %31,\n" + " %32, %33, %34, %35, %36, %37, %38, %39,\n" + " %40, %41, %42, %43, %44, %45, %46, %47,\n" + " %48, %49, %50, %51, %52, %53, %54, %55,\n" + " %56, %57, %58, %59, %60, %61, %62, %63,\n" + " %64, %65, %66, %67, %68, %69, %70, %71,\n" + " %72, %73, %74, %75, %76, %77, %78, %79,\n" + " %80, %81, %82, %83, %84, %85, %86, %87,\n" + " %88, %89, %90, %91, %92, %93, %94, %95,\n" + " %96, %97, %98, %99, %100, %101, %102, %103,\n" + " %104, %105, %106, %107, %108, %109, %110, %111,\n" + " %112, %113, %114, %115, %116, %117, %118, %119,\n" + " %120, %121, %122, %123, %124, %125, %126, %127 \n" + "},\n" + " %128, %129, 1, 1, 1;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]), "+r"(acc[8]), "+r"(acc[9]), "+r"(acc[10]), "+r"(acc[11]), + "+r"(acc[12]), "+r"(acc[13]), "+r"(acc[14]), "+r"(acc[15]), "+r"(acc[16]), "+r"(acc[17]), + "+r"(acc[18]), "+r"(acc[19]), "+r"(acc[20]), "+r"(acc[21]), "+r"(acc[22]), "+r"(acc[23]), + "+r"(acc[24]), "+r"(acc[25]), "+r"(acc[26]), "+r"(acc[27]), "+r"(acc[28]), "+r"(acc[29]), + "+r"(acc[30]), "+r"(acc[31]), "+r"(acc[32]), "+r"(acc[33]), "+r"(acc[34]), "+r"(acc[35]), + "+r"(acc[36]), "+r"(acc[37]), "+r"(acc[38]), "+r"(acc[39]), "+r"(acc[40]), "+r"(acc[41]), + "+r"(acc[42]), "+r"(acc[43]), "+r"(acc[44]), "+r"(acc[45]), "+r"(acc[46]), "+r"(acc[47]), + "+r"(acc[48]), "+r"(acc[49]), "+r"(acc[50]), "+r"(acc[51]), "+r"(acc[52]), "+r"(acc[53]), + "+r"(acc[54]), "+r"(acc[55]), "+r"(acc[56]), "+r"(acc[57]), "+r"(acc[58]), "+r"(acc[59]), + "+r"(acc[60]), "+r"(acc[61]), "+r"(acc[62]), "+r"(acc[63]), "+r"(acc[64]), "+r"(acc[65]), + "+r"(acc[66]), "+r"(acc[67]), "+r"(acc[68]), "+r"(acc[69]), "+r"(acc[70]), "+r"(acc[71]), + "+r"(acc[72]), "+r"(acc[73]), "+r"(acc[74]), "+r"(acc[75]), "+r"(acc[76]), "+r"(acc[77]), + "+r"(acc[78]), "+r"(acc[79]), "+r"(acc[80]), "+r"(acc[81]), "+r"(acc[82]), "+r"(acc[83]), + "+r"(acc[84]), "+r"(acc[85]), "+r"(acc[86]), "+r"(acc[87]), "+r"(acc[88]), "+r"(acc[89]), + "+r"(acc[90]), "+r"(acc[91]), "+r"(acc[92]), "+r"(acc[93]), "+r"(acc[94]), "+r"(acc[95]), + "+r"(acc[96]), "+r"(acc[97]), "+r"(acc[98]), "+r"(acc[99]), "+r"(acc[100]), + "+r"(acc[101]), "+r"(acc[102]), "+r"(acc[103]), "+r"(acc[104]), "+r"(acc[105]), + "+r"(acc[106]), "+r"(acc[107]), "+r"(acc[108]), "+r"(acc[109]), "+r"(acc[110]), + "+r"(acc[111]), "+r"(acc[112]), "+r"(acc[113]), "+r"(acc[114]), "+r"(acc[115]), + "+r"(acc[116]), "+r"(acc[117]), "+r"(acc[118]), "+r"(acc[119]), "+r"(acc[120]), + "+r"(acc[121]), "+r"(acc[122]), "+r"(acc[123]), "+r"(acc[124]), "+r"(acc[125]), + "+r"(acc[126]), "+r"(acc[127]) + : "l"(desc_a), "l"(desc_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ void qgmma_e5m2_e5m2_fp32(uint64_t desc_a, uint64_t desc_b, + uint32_t (&acc)[N / 2]) { + Qgmma_e5m2_e5m2_fp32::mma(desc_a, desc_b, acc); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// QGMMA e5m2 x e5m2 - 64xNx32 TN with int32 Accumulator with A from RF and B from SMEM +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template 
+struct Qgmma_rfa_e5m2_e5m2_fp32 {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x8x32 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Qgmma_rfa_e5m2_e5m2_fp32<8> { + static inline __device__ void mma(uint32_t const (&a)[4], uint64_t desc_b, uint32_t (&acc)[4]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n8k32.f32.e5m2.e5m2\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7\n" + "},\n" + "{ %8, %9, %10, %11 }, %12, 1, 1, 1;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]) + : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "l"(desc_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x32x32 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Qgmma_rfa_e5m2_e5m2_fp32<32> { + static inline __device__ void mma(uint32_t const (&a)[4], uint64_t desc_b, uint32_t (&acc)[16]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n32k32.f32.e5m2.e5m2\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7,\n" + " %8, %9, %10, %11, %12, %13, %14, %15 \n" + "},\n" + "{ %16, %17, %18, %19 }, %20, 1, 1, 1;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]), "+r"(acc[8]), "+r"(acc[9]), "+r"(acc[10]), "+r"(acc[11]), + "+r"(acc[12]), "+r"(acc[13]), "+r"(acc[14]), "+r"(acc[15]) + : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "l"(desc_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x64x32 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Qgmma_rfa_e5m2_e5m2_fp32<64> { + static inline __device__ void mma(uint32_t const (&a)[4], uint64_t desc_b, uint32_t (&acc)[32]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n64k32.f32.e5m2.e5m2\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7,\n" + " %8, %9, %10, %11, %12, %13, %14, %15,\n" + " %16, %17, %18, %19, %20, %21, %22, %23,\n" + " %24, %25, %26, %27, %28, %29, %30, %31 \n" + "},\n" + "{ %32, %33, %34, %35 }, %36, 1, 1, 1;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]), "+r"(acc[8]), "+r"(acc[9]), "+r"(acc[10]), "+r"(acc[11]), + "+r"(acc[12]), "+r"(acc[13]), "+r"(acc[14]), "+r"(acc[15]), "+r"(acc[16]), "+r"(acc[17]), + "+r"(acc[18]), "+r"(acc[19]), "+r"(acc[20]), "+r"(acc[21]), "+r"(acc[22]), "+r"(acc[23]), + "+r"(acc[24]), "+r"(acc[25]), "+r"(acc[26]), "+r"(acc[27]), "+r"(acc[28]), "+r"(acc[29]), + "+r"(acc[30]), "+r"(acc[31]) + : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "l"(desc_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x128x32 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Qgmma_rfa_e5m2_e5m2_fp32<128> { + static inline __device__ void mma(uint32_t const (&a)[4], uint64_t 
desc_b, uint32_t (&acc)[64]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n128k32.f32.e5m2.e5m2\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7,\n" + " %8, %9, %10, %11, %12, %13, %14, %15,\n" + " %16, %17, %18, %19, %20, %21, %22, %23,\n" + " %24, %25, %26, %27, %28, %29, %30, %31,\n" + " %32, %33, %34, %35, %36, %37, %38, %39,\n" + " %40, %41, %42, %43, %44, %45, %46, %47,\n" + " %48, %49, %50, %51, %52, %53, %54, %55,\n" + " %56, %57, %58, %59, %60, %61, %62, %63 \n" + "},\n" + "{ %64, %65, %66, %67 }, %68, 1, 1, 1;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]), "+r"(acc[8]), "+r"(acc[9]), "+r"(acc[10]), "+r"(acc[11]), + "+r"(acc[12]), "+r"(acc[13]), "+r"(acc[14]), "+r"(acc[15]), "+r"(acc[16]), "+r"(acc[17]), + "+r"(acc[18]), "+r"(acc[19]), "+r"(acc[20]), "+r"(acc[21]), "+r"(acc[22]), "+r"(acc[23]), + "+r"(acc[24]), "+r"(acc[25]), "+r"(acc[26]), "+r"(acc[27]), "+r"(acc[28]), "+r"(acc[29]), + "+r"(acc[30]), "+r"(acc[31]), "+r"(acc[32]), "+r"(acc[33]), "+r"(acc[34]), "+r"(acc[35]), + "+r"(acc[36]), "+r"(acc[37]), "+r"(acc[38]), "+r"(acc[39]), "+r"(acc[40]), "+r"(acc[41]), + "+r"(acc[42]), "+r"(acc[43]), "+r"(acc[44]), "+r"(acc[45]), "+r"(acc[46]), "+r"(acc[47]), + "+r"(acc[48]), "+r"(acc[49]), "+r"(acc[50]), "+r"(acc[51]), "+r"(acc[52]), "+r"(acc[53]), + "+r"(acc[54]), "+r"(acc[55]), "+r"(acc[56]), "+r"(acc[57]), "+r"(acc[58]), "+r"(acc[59]), + "+r"(acc[60]), "+r"(acc[61]), "+r"(acc[62]), "+r"(acc[63]) + : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "l"(desc_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x160x32 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Qgmma_rfa_e5m2_e5m2_fp32<160> { + static inline __device__ void mma(uint32_t const (&a)[4], uint64_t desc_b, uint32_t (&acc)[80]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n160k32.f32.e5m2.e5m2\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7,\n" + " %8, %9, %10, %11, %12, %13, %14, %15,\n" + " %16, %17, %18, %19, %20, %21, %22, %23,\n" + " %24, %25, %26, %27, %28, %29, %30, %31,\n" + " %32, %33, %34, %35, %36, %37, %38, %39,\n" + " %40, %41, %42, %43, %44, %45, %46, %47,\n" + " %48, %49, %50, %51, %52, %53, %54, %55,\n" + " %56, %57, %58, %59, %60, %61, %62, %63,\n" + " %64, %65, %66, %67, %68, %69, %70, %71,\n" + " %72, %73, %74, %75, %76, %77, %78, %79 \n" + "},\n" + "{ %80, %81, %82, %83 }, %84, 1, 1, 1;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]), "+r"(acc[8]), "+r"(acc[9]), "+r"(acc[10]), "+r"(acc[11]), + "+r"(acc[12]), "+r"(acc[13]), "+r"(acc[14]), "+r"(acc[15]), "+r"(acc[16]), "+r"(acc[17]), + "+r"(acc[18]), "+r"(acc[19]), "+r"(acc[20]), "+r"(acc[21]), "+r"(acc[22]), "+r"(acc[23]), + "+r"(acc[24]), "+r"(acc[25]), "+r"(acc[26]), "+r"(acc[27]), "+r"(acc[28]), "+r"(acc[29]), + "+r"(acc[30]), "+r"(acc[31]), "+r"(acc[32]), "+r"(acc[33]), "+r"(acc[34]), "+r"(acc[35]), + "+r"(acc[36]), "+r"(acc[37]), "+r"(acc[38]), "+r"(acc[39]), "+r"(acc[40]), "+r"(acc[41]), + "+r"(acc[42]), "+r"(acc[43]), "+r"(acc[44]), "+r"(acc[45]), "+r"(acc[46]), "+r"(acc[47]), + "+r"(acc[48]), "+r"(acc[49]), "+r"(acc[50]), "+r"(acc[51]), "+r"(acc[52]), 
"+r"(acc[53]), + "+r"(acc[54]), "+r"(acc[55]), "+r"(acc[56]), "+r"(acc[57]), "+r"(acc[58]), "+r"(acc[59]), + "+r"(acc[60]), "+r"(acc[61]), "+r"(acc[62]), "+r"(acc[63]), "+r"(acc[64]), "+r"(acc[65]), + "+r"(acc[66]), "+r"(acc[67]), "+r"(acc[68]), "+r"(acc[69]), "+r"(acc[70]), "+r"(acc[71]), + "+r"(acc[72]), "+r"(acc[73]), "+r"(acc[74]), "+r"(acc[75]), "+r"(acc[76]), "+r"(acc[77]), + "+r"(acc[78]), "+r"(acc[79]) + : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "l"(desc_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x192x32 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Qgmma_rfa_e5m2_e5m2_fp32<192> { + static inline __device__ void mma(uint32_t const (&a)[4], uint64_t desc_b, uint32_t (&acc)[96]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n192k32.f32.e5m2.e5m2\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7,\n" + " %8, %9, %10, %11, %12, %13, %14, %15,\n" + " %16, %17, %18, %19, %20, %21, %22, %23,\n" + " %24, %25, %26, %27, %28, %29, %30, %31,\n" + " %32, %33, %34, %35, %36, %37, %38, %39,\n" + " %40, %41, %42, %43, %44, %45, %46, %47,\n" + " %48, %49, %50, %51, %52, %53, %54, %55,\n" + " %56, %57, %58, %59, %60, %61, %62, %63,\n" + " %64, %65, %66, %67, %68, %69, %70, %71,\n" + " %72, %73, %74, %75, %76, %77, %78, %79,\n" + " %80, %81, %82, %83, %84, %85, %86, %87,\n" + " %88, %89, %90, %91, %92, %93, %94, %95 \n" + "},\n" + "{ %96, %97, %98, %99 }, %100, 1, 1, 1;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]), "+r"(acc[8]), "+r"(acc[9]), "+r"(acc[10]), "+r"(acc[11]), + "+r"(acc[12]), "+r"(acc[13]), "+r"(acc[14]), "+r"(acc[15]), "+r"(acc[16]), "+r"(acc[17]), + "+r"(acc[18]), "+r"(acc[19]), "+r"(acc[20]), "+r"(acc[21]), "+r"(acc[22]), "+r"(acc[23]), + "+r"(acc[24]), "+r"(acc[25]), "+r"(acc[26]), "+r"(acc[27]), "+r"(acc[28]), "+r"(acc[29]), + "+r"(acc[30]), "+r"(acc[31]), "+r"(acc[32]), "+r"(acc[33]), "+r"(acc[34]), "+r"(acc[35]), + "+r"(acc[36]), "+r"(acc[37]), "+r"(acc[38]), "+r"(acc[39]), "+r"(acc[40]), "+r"(acc[41]), + "+r"(acc[42]), "+r"(acc[43]), "+r"(acc[44]), "+r"(acc[45]), "+r"(acc[46]), "+r"(acc[47]), + "+r"(acc[48]), "+r"(acc[49]), "+r"(acc[50]), "+r"(acc[51]), "+r"(acc[52]), "+r"(acc[53]), + "+r"(acc[54]), "+r"(acc[55]), "+r"(acc[56]), "+r"(acc[57]), "+r"(acc[58]), "+r"(acc[59]), + "+r"(acc[60]), "+r"(acc[61]), "+r"(acc[62]), "+r"(acc[63]), "+r"(acc[64]), "+r"(acc[65]), + "+r"(acc[66]), "+r"(acc[67]), "+r"(acc[68]), "+r"(acc[69]), "+r"(acc[70]), "+r"(acc[71]), + "+r"(acc[72]), "+r"(acc[73]), "+r"(acc[74]), "+r"(acc[75]), "+r"(acc[76]), "+r"(acc[77]), + "+r"(acc[78]), "+r"(acc[79]), "+r"(acc[80]), "+r"(acc[81]), "+r"(acc[82]), "+r"(acc[83]), + "+r"(acc[84]), "+r"(acc[85]), "+r"(acc[86]), "+r"(acc[87]), "+r"(acc[88]), "+r"(acc[89]), + "+r"(acc[90]), "+r"(acc[91]), "+r"(acc[92]), "+r"(acc[93]), "+r"(acc[94]), "+r"(acc[95]) + : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "l"(desc_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// 64x256x32 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Qgmma_rfa_e5m2_e5m2_fp32<256> { + static inline __device__ void mma(uint32_t const (&a)[4], uint64_t desc_b, uint32_t 
(&acc)[128]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + asm volatile( + "wgmma.mma_async.sync.aligned.m64n256k32.f32.e5m2.e5m2\n" + "{\n" + " %0, %1, %2, %3, %4, %5, %6, %7,\n" + " %8, %9, %10, %11, %12, %13, %14, %15,\n" + " %16, %17, %18, %19, %20, %21, %22, %23,\n" + " %24, %25, %26, %27, %28, %29, %30, %31,\n" + " %32, %33, %34, %35, %36, %37, %38, %39,\n" + " %40, %41, %42, %43, %44, %45, %46, %47,\n" + " %48, %49, %50, %51, %52, %53, %54, %55,\n" + " %56, %57, %58, %59, %60, %61, %62, %63,\n" + " %64, %65, %66, %67, %68, %69, %70, %71,\n" + " %72, %73, %74, %75, %76, %77, %78, %79,\n" + " %80, %81, %82, %83, %84, %85, %86, %87,\n" + " %88, %89, %90, %91, %92, %93, %94, %95,\n" + " %96, %97, %98, %99, %100, %101, %102, %103,\n" + " %104, %105, %106, %107, %108, %109, %110, %111,\n" + " %112, %113, %114, %115, %116, %117, %118, %119,\n" + " %120, %121, %122, %123, %124, %125, %126, %127 \n" + "},\n" + "{ %128, %129, %130, %131 }, %132, 1, 1, 1;\n" + + : "+r"(acc[0]), "+r"(acc[1]), "+r"(acc[2]), "+r"(acc[3]), "+r"(acc[4]), "+r"(acc[5]), + "+r"(acc[6]), "+r"(acc[7]), "+r"(acc[8]), "+r"(acc[9]), "+r"(acc[10]), "+r"(acc[11]), + "+r"(acc[12]), "+r"(acc[13]), "+r"(acc[14]), "+r"(acc[15]), "+r"(acc[16]), "+r"(acc[17]), + "+r"(acc[18]), "+r"(acc[19]), "+r"(acc[20]), "+r"(acc[21]), "+r"(acc[22]), "+r"(acc[23]), + "+r"(acc[24]), "+r"(acc[25]), "+r"(acc[26]), "+r"(acc[27]), "+r"(acc[28]), "+r"(acc[29]), + "+r"(acc[30]), "+r"(acc[31]), "+r"(acc[32]), "+r"(acc[33]), "+r"(acc[34]), "+r"(acc[35]), + "+r"(acc[36]), "+r"(acc[37]), "+r"(acc[38]), "+r"(acc[39]), "+r"(acc[40]), "+r"(acc[41]), + "+r"(acc[42]), "+r"(acc[43]), "+r"(acc[44]), "+r"(acc[45]), "+r"(acc[46]), "+r"(acc[47]), + "+r"(acc[48]), "+r"(acc[49]), "+r"(acc[50]), "+r"(acc[51]), "+r"(acc[52]), "+r"(acc[53]), + "+r"(acc[54]), "+r"(acc[55]), "+r"(acc[56]), "+r"(acc[57]), "+r"(acc[58]), "+r"(acc[59]), + "+r"(acc[60]), "+r"(acc[61]), "+r"(acc[62]), "+r"(acc[63]), "+r"(acc[64]), "+r"(acc[65]), + "+r"(acc[66]), "+r"(acc[67]), "+r"(acc[68]), "+r"(acc[69]), "+r"(acc[70]), "+r"(acc[71]), + "+r"(acc[72]), "+r"(acc[73]), "+r"(acc[74]), "+r"(acc[75]), "+r"(acc[76]), "+r"(acc[77]), + "+r"(acc[78]), "+r"(acc[79]), "+r"(acc[80]), "+r"(acc[81]), "+r"(acc[82]), "+r"(acc[83]), + "+r"(acc[84]), "+r"(acc[85]), "+r"(acc[86]), "+r"(acc[87]), "+r"(acc[88]), "+r"(acc[89]), + "+r"(acc[90]), "+r"(acc[91]), "+r"(acc[92]), "+r"(acc[93]), "+r"(acc[94]), "+r"(acc[95]), + "+r"(acc[96]), "+r"(acc[97]), "+r"(acc[98]), "+r"(acc[99]), "+r"(acc[100]), + "+r"(acc[101]), "+r"(acc[102]), "+r"(acc[103]), "+r"(acc[104]), "+r"(acc[105]), + "+r"(acc[106]), "+r"(acc[107]), "+r"(acc[108]), "+r"(acc[109]), "+r"(acc[110]), + "+r"(acc[111]), "+r"(acc[112]), "+r"(acc[113]), "+r"(acc[114]), "+r"(acc[115]), + "+r"(acc[116]), "+r"(acc[117]), "+r"(acc[118]), "+r"(acc[119]), "+r"(acc[120]), + "+r"(acc[121]), "+r"(acc[122]), "+r"(acc[123]), "+r"(acc[124]), "+r"(acc[125]), + "+r"(acc[126]), "+r"(acc[127]) + : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "l"(desc_b)); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ void qgmma_rfa_e5m2_e5m2_fp32(uint32_t const (&a)[4], uint64_t desc_b, + uint32_t (&acc)[N / 2]) { + Qgmma_rfa_e5m2_e5m2_fp32::mma(a, desc_b, acc); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace fmha diff --git a/csrc/fmha_v2/fmha/hopper/utils_tma.h 
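// Illustrative usage sketch (not part of this change; the helper name and the N = 128 tile size
// are hypothetical). The Qgmma_* wrappers above only issue the asynchronous wgmma instruction, so
// the caller drives the fence/commit/wait protocol from fmha/hopper/utils_warpgroup.h added later
// in this diff. desc_a/desc_b are assumed to be valid GMMA shared-memory matrix descriptors; the
// uint32_t accumulators hold f32 bit patterns and start zero-initialized, since the wrappers are
// emitted with scale-d = 1 and therefore accumulate into acc.
inline __device__ void qgmma_e5m2_e5m2_usage_sketch(uint64_t desc_a, uint64_t desc_b,
                                                    float (&out)[64]) {
  uint32_t acc[128 / 2] = {};  // 64 f32 accumulators per thread for an m64n128k32 tile
  fmha::warpgroup_arrive();    // wgmma.fence: make prior SMEM/RF writes visible to wgmma
  fmha::qgmma_e5m2_e5m2_fp32<128>(desc_a, desc_b, acc);  // A and B both read via SMEM descriptors
  // If the A fragment is already in registers (4 x 32b per thread), the rfa variant is used:
  //   fmha::qgmma_rfa_e5m2_e5m2_fp32<128>(a_frag, desc_b, acc);
  fmha::warpgroup_commit();    // wgmma.commit_group
  fmha::warpgroup_wait<0>();   // block until every committed group has retired
#pragma unroll
  for (int ii = 0; ii < 64; ++ii) {
    out[ii] = __uint_as_float(acc[ii]);  // accumulators are IEEE fp32 stored in uint32_t registers
  }
}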
b/csrc/fmha_v2/fmha/hopper/utils_tma.h new file mode 100644 index 0000000000..faa63edb81 --- /dev/null +++ b/csrc/fmha_v2/fmha/hopper/utils_tma.h @@ -0,0 +1,155 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2011-2024 NVIDIA CORPORATION & AFFILIATES. All rights + * reserved. SPDX-License-Identifier: NVIDIA TensorRT Source Code License Agreement + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. + */ + +#pragma once + +#include +#include + +namespace fmha { + +inline __device__ uint32_t elect_one_sync(); + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ void utmaldg(cudaTmaDesc const* p_desc, // TMA desc + uint32_t smem_ptr, // desc smem address + uint32_t smem_barrier, // smem_barrier + int32_t const (&coord)[DIM], // coord + uint32_t elect_one = 1); + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// UTMALDG TILED WITHOUT MULTICAST +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +inline __device__ void utmaldg<2, fmha::cudaTmaDescType::TILED, false>(cudaTmaDesc const* p_desc, + uint32_t smem_ptr, + uint32_t smem_barrier, + int32_t const (&coord)[2], + uint32_t elect_one) { + if (elect_one) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 + asm volatile( + "cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes " + "[%0], [%1, {%2, %3}], [%4];\n" + : + : "r"(smem_ptr), "l"(reinterpret_cast(p_desc)), "r"(coord[0]), "r"(coord[1]), + "r"(smem_barrier) + : "memory"); +#endif + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +inline __device__ void utmaldg<3, fmha::cudaTmaDescType::TILED, false>(cudaTmaDesc const* p_desc, + uint32_t smem_ptr, + uint32_t smem_barrier, + int32_t const (&coord)[3], + uint32_t elect_one) { + if (elect_one) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 + asm volatile( + "cp.async.bulk.tensor.3d.shared::cluster.global.mbarrier::complete_tx::bytes " + "[%0], [%1, {%2, %3, %4}], [%5];\n" + : + : "r"(smem_ptr), "l"(reinterpret_cast(p_desc)), "r"(coord[0]), "r"(coord[1]), + "r"(coord[2]), "r"(smem_barrier) + : "memory"); +#endif + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// 4D, TILED, without Multicast +template <> +inline __device__ void utmaldg<4, fmha::cudaTmaDescType::TILED, false>(cudaTmaDesc const* p_desc, + uint32_t smem_ptr, + uint32_t smem_barrier, + int32_t const (&coord)[4], + uint32_t elect_one) { + if (elect_one) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 + asm volatile( + "cp.async.bulk.tensor.4d.shared::cluster.global.mbarrier::complete_tx::bytes " + "[%0], [%1, {%2, %3, %4, %5}], [%6];\n" + : + : "r"(smem_ptr), "l"(reinterpret_cast(p_desc)), "r"(coord[0]), "r"(coord[1]), + "r"(coord[2]), "r"(coord[3]), "r"(smem_barrier) + : "memory"); +#endif + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// UTMASTG TILED WITHOUT MULTICAST +// 
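// Illustrative sketch only: the TILED store path declared below is issued by a single thread (the
// load path above guards with elect_one; the store path leaves that choice to the caller) and is
// fenced with the tmastg_arrive()/tmastg_wait() helpers at the end of this file. The names p_desc,
// smem_o and the coordinate values are assumptions for the example: p_desc is a TMA descriptor
// prepared on the host for a 3D TILED tensor, and smem_o is the shared-memory address of the tile
// being written out.
//
//   int32_t coord[3] = {c0, c1, c2};                               // box coordinates in the tensor
//   utmastg<3, fmha::cudaTmaDescType::TILED>(p_desc, smem_o, coord);
//   tmastg_arrive();   // cp.async.bulk.commit_group
//   tmastg_wait();     // cp.async.bulk.wait_group.read 0: the SMEM tile may be reused afterwards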
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ void utmastg(cudaTmaDesc const* p_desc, // TMA desc + uint32_t smem_ptr, // src smem address + int32_t const (&coord)[DIM]); // coord + +// 3D, TILED +template <> +inline __device__ void utmastg<3, fmha::cudaTmaDescType::TILED>(cudaTmaDesc const* p_desc, + uint32_t smem_ptr, + const int32_t (&coord)[3]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 + asm volatile( + "cp.async.bulk.tensor.3d.global.shared::cta.bulk_group [%0, {%1, %2, %3}], [%4];\n" ::"l"( + reinterpret_cast(p_desc)), + "r"(coord[0]), "r"(coord[1]), "r"(coord[2]), "r"(smem_ptr) + : "memory"); +#endif +} + +// 4D, TILED +template <> +inline __device__ void utmastg<4, fmha::cudaTmaDescType::TILED>(cudaTmaDesc const* p_desc, + uint32_t smem_ptr, + int32_t const (&coord)[4]) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 + asm volatile( + "cp.async.bulk.tensor.4d.global.shared::cta.bulk_group [%0, {%1, %2, %3, %4}], [%5];\n" ::"l"( + reinterpret_cast(p_desc)), + "r"(coord[0]), "r"(coord[1]), "r"(coord[2]), "r"(coord[3]), "r"(smem_ptr) + : "memory"); +#endif +} + +inline __device__ void tmastg_arrive() { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 + asm volatile("cp.async.bulk.commit_group;"); +#else + assert(false); +#endif +} + +inline __device__ void tmastg_wait() { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 + asm volatile("cp.async.bulk.wait_group.read %0;" : : "n"(0) : "memory"); +#else + assert(false); +#endif +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// +} // namespace fmha diff --git a/csrc/fmha_v2/fmha/hopper/utils_warpgroup.h b/csrc/fmha_v2/fmha/hopper/utils_warpgroup.h new file mode 100644 index 0000000000..8923316f61 --- /dev/null +++ b/csrc/fmha_v2/fmha/hopper/utils_warpgroup.h @@ -0,0 +1,44 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2011-2024 NVIDIA CORPORATION & AFFILIATES. All rights + * reserved. SPDX-License-Identifier: NVIDIA TensorRT Source Code License Agreement + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. 
+ */ + +#pragma once + +namespace fmha { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void warpgroup_arrive() { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + asm volatile("wgmma.fence.sync.aligned;\n" ::); +#endif +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void warpgroup_commit() { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + asm volatile("wgmma.commit_group.sync.aligned;\n" ::); +#endif +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ void warpgroup_wait() { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL) + asm volatile("wgmma.wait_group.sync.aligned %0;\n" ::"n"(N)); +#endif +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace fmha diff --git a/csrc/fmha_v2/fmha/kernel_traits.h b/csrc/fmha_v2/fmha/kernel_traits.h new file mode 100644 index 0000000000..8e1d5cbb22 --- /dev/null +++ b/csrc/fmha_v2/fmha/kernel_traits.h @@ -0,0 +1,879 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2011-2024 NVIDIA CORPORATION & AFFILIATES. All rights + * reserved. SPDX-License-Identifier: NVIDIA TensorRT Source Code License Agreement + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. 
+ */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace fmha { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// Ada hmma/imma reuses Ampere +template +struct Traits_reuse { + using Traits = Traits_; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Traits_reuse { + using Traits = fmha::Ampere_hmma_fp16_traits; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Traits_reuse { + using Traits = fmha::Ampere_hmma_fp32_traits; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Traits_reuse { + using Traits = fmha::Ampere_imma_int8_int32_traits; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Traits_o_adapter { + using Traits = Traits_p; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Traits_o_adapter { + using Traits = fmha::Volta_hmma_fp16_16x16x16_traits; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// convert to fp16 before smem_o store +template <> +struct Traits_o_adapter { + using Traits = fmha::Ampere_hmma_fp16_traits; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// convert to fp16 before smem_o store +template <> +struct Traits_o_adapter { + using Traits = fmha::Turing_hmma_fp16_traits; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// convert to bf16 before smem_o store +template <> +struct Traits_o_adapter { + using Traits = fmha::Ampere_hmma_bf16_bf16_traits; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + // Instruction traits. + typename Traits_, + // The global memory tile for Q, K and V. + template class Gmem_tile_q_, + template class Gmem_tile_k_, + template class Gmem_tile_v_, + // The global memory tile for the output. + template class Gmem_tile_o_, + // Sequence length. + int S, + // The valid hidden dimension. + int VALID_D_, + // The valid hidden dimension of V. + int VALID_DV_, + // The iteration step of the outer loop. + int STEP, + // The number of vertical warps. + int WARPS_M, + // The number of horizontal warps. + int WARPS_N, + // The number of CTAs per head for Cta_tile_p; equivalent to BMM1 split-K + int CTAS_PER_HEAD_, + // The flags to control the behaviour of LDGs. + uint32_t FLAGS, + // The version of the kernel. + int VERSION_, + // The mask version of the kernel + int MASK_VERSION_, + // Do we use half epilogue for the 2nd GEMM (hmma_fp32) + bool BMM2_FP16_EPILOGUE = true, + // non-positive means disabled + int SAGE_BLOCK_SIZE_Q_ = 0, int SAGE_BLOCK_SIZE_K_ = 0, int SAGE_BLOCK_SIZE_V_ = 0> +struct Kernel_traits_ { + // The instruction traits for the Q*K product. + using Traits_p = typename Traits_reuse::Traits; + // The instruction traits for the P*V product. Hack to change the traits for Volta HMMA. + using Traits_o = typename Traits_o_adapter::Traits; + // The instruction traits for the epilogue of the 2nd GEMM. 
Always use FP16. + using Traits_e = typename Traits_o_adapter::Traits; + + // The padded D dimension + enum { VALID_D = VALID_D_ }; + + enum { D = Next_power_of_two::VALUE }; + + enum { VALID_DV = VALID_DV_ > 0 ? VALID_DV_ : VALID_D }; + + enum { DV = Next_power_of_two::VALUE }; + + enum { + SAGE_ATTENTION = SAGE_BLOCK_SIZE_Q_ > 0 || SAGE_BLOCK_SIZE_K_ > 0 || SAGE_BLOCK_SIZE_V_ > 0 + }; + + enum { SAGE_BLOCK_SIZE_Q = SAGE_BLOCK_SIZE_Q_ }; + + enum { SAGE_BLOCK_SIZE_K = SAGE_BLOCK_SIZE_K_ }; + + enum { SAGE_BLOCK_SIZE_V = SAGE_BLOCK_SIZE_V_ }; + + // TODO: expose these tiling params to the interface + enum { USE_GRANULAR_TILING = (FLAGS & 0x1000) != 0u }; // TODO ANT: check FLAGS + + using Traits_tile_size = + Traits_tile_size<(bool)USE_GRANULAR_TILING, STEP, S, D, DV, Traits_o::K_PER_MMA>; + + enum { CTA_P_TILE_M = Traits_tile_size::CTA_P_TILE_M }; + + enum { CTA_P_TILE_N = Traits_tile_size::CTA_P_TILE_N }; + + enum { CTA_P_TILE_K = Traits_tile_size::CTA_P_TILE_K }; + + enum { CTA_O_TILE_M = Traits_tile_size::CTA_O_TILE_M }; + + enum { CTA_O_TILE_N = Traits_tile_size::CTA_O_TILE_N }; + + enum { CTA_O_TILE_K = Traits_tile_size::CTA_O_TILE_K }; + + // Do we need to reload Q due to splitting the D ? + enum { RELOAD_Q = static_cast(CTA_P_TILE_K) != static_cast(D) }; + + // The CTA description for the 1st GEMM. + using Cta_tile_p = + typename Traits_p::template Cta_tile_extd; + // The CTA description for the 2nd GEMM. + using Cta_tile_o = + typename Traits_o::template Cta_tile_extd; + + // The MMA tile for the 1st GEMM. + using Mma_tile_p = typename Traits_p::template Mma_tile; + // The MMA tile for the 2nd GEMM. + using Mma_tile_o = typename Traits_o::template Mma_tile; + + // Compute the total BMM2_MMAS_K (might not the same as Mma_tile_o::MMAS_K if the granular tiling + // is used). + static_assert(S % CTA_O_TILE_K == 0, ""); + + enum { TOTAL_BMM2_MMAS_K = Mma_tile_o::MMAS_K * (S / CTA_O_TILE_K) }; + + // Constraints on the K dimension. + static_assert(Mma_tile_p::K_PER_MMA <= static_cast(D)); + static_assert(Mma_tile_o::K_PER_MMA <= S); + + // The version. + enum { VERSION = VERSION_ }; + + // The mask version: padding (2), causal (3), sliding_window_causal (4), custom_mask (5). + enum { MASK_VERSION = MASK_VERSION_ }; + + // Whether use causal mask or not. + enum { CAUSAL_MASK = MASK_VERSION_ == 3 || MASK_VERSION_ == 4 }; + + // Whether use the sliding window attention or not. + enum { SLIDING_WINDOW_ATTENTION = MASK_VERSION_ == 4 }; + + // Whether use the custom mask or not. + enum { CUSTOM_MASK = MASK_VERSION_ == 5 }; + + // Do we use LDGSTS for Q, K or V. + enum { USE_LDGSTS_Q = (FLAGS & 0x1u) != 0u }; + + enum { USE_LDGSTS_K = (FLAGS & 0x2u) != 0u }; + + enum { USE_LDGSTS_V = (FLAGS & 0x4u) != 0u }; + + // Do we use one buffer for K and V. + enum { SHARE_SMEM_FOR_K_AND_V = (FLAGS & 0x8u) != 0u }; + + // Do we use the scale max trick. + enum { USE_SCALE_MAX = (FLAGS & 0x10u) != 0u }; + + // Are heads in QKV interleaved, i.e. total x h x 3 x d or total x 3 x h x d. + enum { HEADS_INTERLEAVED = (FLAGS & 0x20u) == 0u }; + + // Keep full K matrix in registers. 
+ enum { K_IN_REGS = (FLAGS & 0x40) == 0u }; + + // Do we use only 2 fragments or full fragments for frag_q/k (only used by flash attention) + enum { LIMIT_QK_FRAGMENTS = ((FLAGS & 0x80u) != 0u && !SHARE_SMEM_FOR_K_AND_V) }; + + // Do we use only 2 fragments or full fragments for frag_v (only used by flash attention) + enum { LIMIT_V_FRAGMENTS = ((FLAGS & 0x100u) != 0u && !SHARE_SMEM_FOR_K_AND_V) }; + + // Limiting QK fragments implies SMEM_K has to reside in SMEM + static_assert(!(LIMIT_QK_FRAGMENTS && SHARE_SMEM_FOR_K_AND_V), ""); + + // Indicates that kernel does not loop over Q tensor, usually kernel name has _nl suffix + enum { NO_LOOP = (FLAGS & 0x200u) != 0u }; + + // Are sequences in one batch interleaved. i.e. s x b x ..., or b x s x ... + enum { SEQUENCES_INTERLEAVED = (FLAGS & 0x400) != 0u }; + + // Use BMM1 softcapping scale or not. + enum { ENABLE_BMM1_SOFTCAPPING_SCALE = (FLAGS & 0x800) != 0u }; + + // Use MTP (multi-token prediction for MLA kernels) or not. + enum { IS_MTP = (FLAGS & 0x2000) != 0u }; + + // The number of CTAs per head for Cta_tile_p; equivalent to BMM1 split-K + enum { CTAS_PER_HEAD = CTAS_PER_HEAD_ }; + + // The number of shared memory buffers to build a software pipeline for Q, K and V. + enum { + BUFFERS_PER_TILE_SMEM_Q = (USE_GRANULAR_TILING && D > 64) || (USE_LDGSTS_Q && !NO_LOOP) ? 2 : 1 + }; + + enum { BUFFERS_PER_TILE_SMEM_K = USE_GRANULAR_TILING ? 2 : 1 }; + + enum { BUFFERS_PER_TILE_SMEM_V = USE_GRANULAR_TILING ? 2 : 1 }; + + // The global memory tile to load Q. + using Gmem_tile_q = Gmem_tile_q_; + + // The shared memory tile to swizzle Q. + using Smem_tile_q = fmha::Smem_tile_a; + + // The global memory tile to load K. + using Gmem_tile_k = Gmem_tile_k_; + + // The shared memory tile to swizzle K. + using Smem_tile_k = fmha::Smem_tile_b; + + // The global memory tile to load V. + using Gmem_tile_v = Gmem_tile_v_; + + // The shared memory tile to swizzle V. + using Smem_tile_v = fmha::Smem_tile_v; + + // The global memory tile to store O. + using Gmem_tile_o = Gmem_tile_o_; + // The shared memory tile for O. + using Smem_tile_o = fmha::Smem_tile_o; + + // Make sure the number of threads match. + static_assert((int)Gmem_tile_o::THREADS_PER_ROW == (int)Smem_tile_o::THREADS_PER_ROW, ""); + + // The number of threads. + enum { THREADS = Cta_tile_p::THREADS_PER_CTA }; + + // Make sure the number of threads matches both CTAs. + static_assert((int)THREADS == (int)Cta_tile_o::THREADS_PER_CTA, ""); + + // The amount of shared memory needed to load Q and K. + enum { BYTES_PER_SMEM_QK = Smem_tile_q::BYTES_PER_TILE + Smem_tile_k::BYTES_PER_TILE }; + + // The extra amount of shared memory needed to load V. + enum { BYTES_PER_SMEM_V = SHARE_SMEM_FOR_K_AND_V ? 0u : Smem_tile_v::BYTES_PER_TILE }; + + // The amount of shared memory needed for Q, K and V.. + enum { BYTES_PER_SMEM_QKV = BYTES_PER_SMEM_QK + BYTES_PER_SMEM_V }; + + // The amount of shared memory needed to load/store O. + enum { BYTES_PER_SMEM_O = Smem_tile_o::BYTES_PER_TILE }; + + // The amount of shared memory needed to load Q and store O. + enum { + BYTES_PER_SMEM_QO = + NO_LOOP ? Smem_tile_o::BYTES_PER_TILE : Smem_tile_q::BYTES_PER_TILE + BYTES_PER_SMEM_O + }; + + // The amount of shared memory needed for Q, K, V and O. + enum { BYTES_PER_SMEM = fmha::Max::VALUE }; + + // Make sure we have enough shared memory. + static_assert((NO_LOOP + ? 
Smem_tile_o::BYTES_PER_TILE + : Smem_tile_q::BYTES_PER_TILE + Smem_tile_o::BYTES_PER_TILE) <= BYTES_PER_SMEM, + ""); +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + // Instruction traits. + typename Traits_, + // The global memory tile for Q, K and V. + template class Gmem_tile_q_, + // The global memory tile for the output. + template class Gmem_tile_o_, + // Sequence length for K/V. + int S_KV, + // The hidden dimension. + int D, + // The iteration step of the outer loop. + int STEP, + // The number of vertical warps. + int WARPS_M, + // The number of horizontal warps. + int WARPS_N, + // The number of CTAs per head for Cta_tile_p; equivalent to BMM1 split-K + int CTAS_PER_HEAD_, + // The flags to control the behaviour of LDGs. + uint32_t FLAGS, + // The version of the kernel. + int VERSION_, + // Do we use half epilogue for the 2nd GEMM (hmma_fp32) + bool BMM2_FP16_EPILOGUE = true> +struct Kernel_traits_fmhca_ { + // The instruction traits for the Q*K product. + using Traits_p = typename Traits_reuse::Traits; + // The instruction traits for the P*V product. Hack to change the traits for Volta HMMA. + using Traits_o = typename Traits_o_adapter::Traits; + // The instruction traits for the epilogue of the 2nd GEMM. Always use FP16. + using Traits_e = typename Traits_o_adapter::Traits; + + // The CTA description for the 1st GEMM. + using Cta_tile_p = + typename Traits_p::template Cta_tile_extd; + // The CTA description for the 2nd GEMM. + using Cta_tile_o = + typename Traits_o::template Cta_tile_extd; + + // The MMA tile for the 1st GEMM. + using Mma_tile_p = typename Traits_p::template Mma_tile; + // The MMA tile for the 2nd GEMM. + using Mma_tile_o = typename Traits_o::template Mma_tile; + + // Constraints on the K dimension. + static_assert(Mma_tile_p::K_PER_MMA <= D, ""); + static_assert(Mma_tile_o::K_PER_MMA <= S_KV, ""); + + // The version. + enum { VERSION = VERSION_ }; + + // The mask version + enum { MASK_VERSION = VERSION_ }; + + // Whether use causal mask or not. + enum { CAUSAL_MASK = MASK_VERSION >= 3 }; + + // Whether use the sliding window attention or not. + enum { SLIDING_WINDOW_ATTENTION = MASK_VERSION == 4 }; + + // Do we use LDGSTS for Q, K or V. + enum { USE_LDGSTS_Q = (FLAGS & 0x1u) != 0u }; + + enum { USE_LDGSTS_K = (FLAGS & 0x2u) != 0u }; + + enum { USE_LDGSTS_V = (FLAGS & 0x4u) != 0u }; + + // Do we use one buffer for K and V. + enum { SHARE_SMEM_FOR_K_AND_V = (FLAGS & 0x8u) != 0u }; + + // Do we use the scale max trick. + enum { USE_SCALE_MAX = (FLAGS & 0x10u) != 0u }; + + // Are heads in QKV interleaved, i.e. total x h x 3 x d or total x 3 x h x d. + enum { HEADS_INTERLEAVED = (FLAGS & 0x20u) == 0u }; + + // Keep full K matrix in registers. + enum { K_IN_REGS = (FLAGS & 0x40) == 0u }; + + // Use BMM1 softcapping scale or not. + enum { ENABLE_BMM1_SOFTCAPPING_SCALE = 0 }; + + // The number of CTAs per head for Cta_tile_p; equivalent to BMM1 split-K + enum { CTAS_PER_HEAD = CTAS_PER_HEAD_ }; + + // The global memory tile to load Q. + using Gmem_tile_q = Gmem_tile_q_; + + // The shared memory tile to swizzle Q. + using Smem_tile_q = fmha::Smem_tile_a; + + // The global memory tile to load K. + using Gmem_tile_k = Gmem_tile_q_; + + // The shared memory tile to swizzle K. + using Smem_tile_k = fmha::Smem_tile_b; + + // The global memory tile to load V. + using Gmem_tile_v = Gmem_tile_q_; + + // The shared memory tile to swizzle V. 
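Editor's note: the FLAGS template parameter of the Kernel_traits_ struct above is a bit-field, and the enums decode it bit by bit (0x1/0x2/0x4 select LDGSTS for Q/K/V, 0x8 shares the K/V smem buffer, a set 0x20 or 0x40 bit *disables* HEADS_INTERLEAVED or K_IN_REGS, 0x200 marks the no-loop variant, 0x1000 enables granular tiling, and so on). A small standalone decoder that mirrors those enum definitions makes the encoding easy to check; the program itself is not part of the diff.

```cpp
// Standalone sketch (not part of this diff): decode the Kernel_traits_ FLAGS
// bit-field exactly as the enums above interpret it.
#include <cstdint>
#include <cstdio>

void decode_kernel_traits_flags(uint32_t flags) {
  std::printf("USE_LDGSTS_Q                  = %d\n", (flags & 0x1u) != 0u);
  std::printf("USE_LDGSTS_K                  = %d\n", (flags & 0x2u) != 0u);
  std::printf("USE_LDGSTS_V                  = %d\n", (flags & 0x4u) != 0u);
  std::printf("SHARE_SMEM_FOR_K_AND_V        = %d\n", (flags & 0x8u) != 0u);
  std::printf("USE_SCALE_MAX                 = %d\n", (flags & 0x10u) != 0u);
  std::printf("HEADS_INTERLEAVED             = %d\n", (flags & 0x20u) == 0u);  // set bit disables
  std::printf("K_IN_REGS                     = %d\n", (flags & 0x40u) == 0u);  // set bit disables
  // In the traits, the two LIMIT_* bits are additionally gated on not sharing K/V smem.
  std::printf("LIMIT_QK_FRAGMENTS            = %d\n", (flags & 0x80u) != 0u);
  std::printf("LIMIT_V_FRAGMENTS             = %d\n", (flags & 0x100u) != 0u);
  std::printf("NO_LOOP                       = %d\n", (flags & 0x200u) != 0u);
  std::printf("SEQUENCES_INTERLEAVED         = %d\n", (flags & 0x400u) != 0u);
  std::printf("ENABLE_BMM1_SOFTCAPPING_SCALE = %d\n", (flags & 0x800u) != 0u);
  std::printf("USE_GRANULAR_TILING           = %d\n", (flags & 0x1000u) != 0u);
  std::printf("IS_MTP                        = %d\n", (flags & 0x2000u) != 0u);
}

int main() {
  decode_kernel_traits_flags(0x8u);  // the default FLAGS value used by the aliases below
  return 0;
}
```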
+ using Smem_tile_v = fmha::Smem_tile_v; + + // The global memory tile to store O. + using Gmem_tile_o = Gmem_tile_o_; + // The shared memory tile for O. + using Smem_tile_o = fmha::Smem_tile_o; + + // Make sure the number of threads match. + static_assert((int)Gmem_tile_o::THREADS_PER_ROW == (int)Smem_tile_o::THREADS_PER_ROW, ""); + + // The number of threads. + enum { THREADS = Cta_tile_p::THREADS_PER_CTA }; + + // Make sure the number of threads matches both CTAs. + static_assert((int)THREADS == (int)Cta_tile_o::THREADS_PER_CTA, ""); + + // The amount of shared memory needed to load Q and K. + enum { BYTES_PER_SMEM_QK = Smem_tile_q::BYTES_PER_TILE + Smem_tile_k::BYTES_PER_TILE }; + + // The extra amount of shared memory needed to load V. + enum { BYTES_PER_SMEM_V = SHARE_SMEM_FOR_K_AND_V ? 0u : Smem_tile_v::BYTES_PER_TILE }; + + // The amount of shared memory needed for Q, K and V.. + enum { BYTES_PER_SMEM_QKV = BYTES_PER_SMEM_QK + BYTES_PER_SMEM_V }; + + // The amount of shared memory needed to load Q and store O. + enum { BYTES_PER_SMEM_QO = Smem_tile_q::BYTES_PER_TILE + Smem_tile_o::BYTES_PER_TILE }; + + // The amount of shared memory needed for Q, K, V and O. + enum { BYTES_PER_SMEM = fmha::Max::VALUE }; + + // Make sure we have enough shared memory. + static_assert(Smem_tile_q::BYTES_PER_TILE + Smem_tile_o::BYTES_PER_TILE <= BYTES_PER_SMEM, ""); +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + // The instruction traits. + typename Traits_, + // The sequence length. + int S, + // The hidden size per head. + int VALID_D, + // The number of timesteps per iteration of the main loop. + int STEP, + // The number of vertical warps. + int WARPS_M, + // The number of horizontal warps. + int WARPS_N, + // The number of CTAs per head for Cta_tile_p; equivalent to BMM1 split-K + int CTAS_PER_HEAD_, + // The flags. + uint32_t FLAGS = 0x8, + // The mask version of the kernel + int MASK_VERSION_ = 2> +struct Kernel_traits_interleaved_v2_ { + // The instruction traits. + using Traits = typename Traits_reuse::Traits; + using Traits_p = Traits; + using Traits_o = Traits; + + // The padded D dimension + enum { D = Next_power_of_two::VALUE }; + + // The CTA description for the 1st GEMM. + using Cta_tile_p = + typename Traits::template Cta_tile_extd; + // The CTA description for the 2nd GEMM. + using Cta_tile_o = + typename Traits::template Cta_tile_extd; + + // The version. + enum { VERSION = 2 }; + + enum { MASK_VERSION = MASK_VERSION_ }; + + // Whether use causal mask or not. + enum { CAUSAL_MASK = MASK_VERSION_ >= 3 }; + + // Whether use the sliding window attention or not. + enum { SLIDING_WINDOW_ATTENTION = MASK_VERSION_ == 4 }; + + // The number of CTAs per head for Cta_tile_p; equivalent to BMM1 split-K + enum { CTAS_PER_HEAD = CTAS_PER_HEAD_ }; + + // Do we use LDGSTS for Q, K or V. + enum { USE_LDGSTS_Q = (FLAGS & 0x1u) != 0u }; + + enum { USE_LDGSTS_K = (FLAGS & 0x2u) != 0u }; + + enum { USE_LDGSTS_V = (FLAGS & 0x4u) != 0u }; + + // Do we use one buffer for K and V. + enum { SHARE_SMEM_FOR_K_AND_V = (FLAGS & 0x8u) != 0u }; + + // Do we use the scale max trick. + enum { USE_SCALE_MAX = (FLAGS & 16) != 0u }; + + // The global memory tile to load Q. + using Gmem_tile_q = + fmha::v2::Gmem_tile_qkv_interleaved; + // The shared memory tile to swizzle Q. + using Smem_tile_q = fmha::Smem_tile_qk_interleaved_a; + + // The global memory tile to load K. 
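Editor's note: in both trait structs the shared-memory budget resolves to max(Q + K + V tiles, Q tile + O tile), with the V tile dropping out when SHARE_SMEM_FOR_K_AND_V is set (and, in the non-cross-attention traits, the Q term dropping out of the Q+O side for NO_LOOP kernels, which is ignored here). A constexpr sketch with made-up per-tile byte counts (the real values come from the Smem_tile_* types) illustrates the arithmetic; it is not part of the diff.

```cpp
// Standalone sketch (not part of this diff): the shared-memory budget rule used by
// the kernel traits above, with hypothetical per-tile byte counts.
#include <algorithm>
#include <cstdio>

constexpr int smem_budget_bytes(int bytes_q, int bytes_k, int bytes_v, int bytes_o,
                                bool share_smem_for_k_and_v) {
  int const bytes_qk = bytes_q + bytes_k;
  int const bytes_v_extra = share_smem_for_k_and_v ? 0 : bytes_v;  // V reuses the K buffer if shared
  int const bytes_qkv = bytes_qk + bytes_v_extra;
  int const bytes_qo = bytes_q + bytes_o;  // (NO_LOOP variants drop the Q term here)
  return std::max(bytes_qkv, bytes_qo);
}

int main() {
  // Hypothetical tile sizes: 16 KB each for Q/K/V, 8 KB for O.
  std::printf("separate K/V buffers: %d bytes\n",
              smem_budget_bytes(16384, 16384, 16384, 8192, false));
  std::printf("shared  K/V buffer  : %d bytes\n",
              smem_budget_bytes(16384, 16384, 16384, 8192, true));
  return 0;
}
```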
+ using Gmem_tile_k = + fmha::v2::Gmem_tile_qkv_interleaved; + // The shared memory tile to swizzle K. + using Smem_tile_k = fmha::Smem_tile_qk_interleaved_b; + + // The global memory tile to load V. + using Gmem_tile_v = + fmha::v2::Gmem_tile_qkv_interleaved; + + // The shared memory tile to swizzle V. + using Smem_tile_v = fmha::Smem_tile_v_interleaved_b; + + // The global memory tile to store O. + using Gmem_tile_o = fmha::v2::Imma_gmem_tile_o_interleaved; + // The shared memory tile for O. + using Smem_tile_o = fmha::Smem_tile_o_interleaved; + + // Make sure the number of threads match. + static_assert((int)Gmem_tile_o::THREADS_PER_ROW == (int)Smem_tile_o::THREADS_PER_ROW, ""); + + // The number of threads. + enum { THREADS = Cta_tile_p::THREADS_PER_CTA }; + + // Make sure the number of threads matches both CTAs. + static_assert((int)THREADS == (int)Cta_tile_o::THREADS_PER_CTA, ""); + + // The amount of shared memory needed to load Q and K. + enum { BYTES_PER_SMEM_QK = Smem_tile_q::BYTES_PER_TILE + Smem_tile_k::BYTES_PER_TILE }; + + // The extra amount of shared memory needed to load V. + enum { BYTES_PER_SMEM_V = SHARE_SMEM_FOR_K_AND_V ? 0u : Smem_tile_v::BYTES_PER_TILE }; + + // The amount of shared memory needed for Q, K and V.. + enum { BYTES_PER_SMEM_QKV = BYTES_PER_SMEM_QK + BYTES_PER_SMEM_V }; + + // The amount of shared memory needed to load Q and store O. + enum { BYTES_PER_SMEM_QO = Smem_tile_q::BYTES_PER_TILE + Smem_tile_o::BYTES_PER_TILE }; + + // The amount of shared memory needed for Q, K, V and O. + enum { BYTES_PER_SMEM = fmha::Max::VALUE }; + + // Make sure we have enough shared memory. + static_assert(Smem_tile_q::BYTES_PER_TILE + Smem_tile_o::BYTES_PER_TILE <= BYTES_PER_SMEM, ""); +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + // The instruction traits. + typename Traits_, + // The sequence length. + int S, + // The hidden size per head. + int VALID_D, + // The number of timesteps per iteration of the main loop. + int STEP, + // The number of vertical warps. + int WARPS_M, + // The number of horizontal warps. + int WARPS_N, + // The number of CTAs per head for Cta_tile_p; equivalent to BMM1 split-K + int CTAS_PER_HEAD_, + // The flags. + uint32_t FLAGS = 0x8, + // The mask version of the kernel + int MASK_VERSION_ = 2> +using Kernel_traits_interleaved_v2 = + Kernel_traits_interleaved_v2_; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + // The instruction traits. + typename Traits, + // The sequence length. + int S, + // The hidden size per head. + int D, + // The number of timesteps per iteration of the main loop. + int STEP, + // The number of vertical warps. + int WARPS_M, + // The number of horizontal warps. + int WARPS_N, + // The number of CTAs per head for Cta_tile_p; equivalent to BMM1 split-K + int CTAS_PER_HEAD, + // The flags. + uint32_t FLAGS = 0x8> +using Kernel_traits_v1 = Kernel_traits_; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + // The instruction traits. + typename Traits, + // The sequence length. + int S, + // The hidden size per head. + int D, + // The number of timesteps per iteration of the main loop. + int STEP, + // The number of vertical warps. + int WARPS_M, + // The number of horizontal warps. + int WARPS_N, + // The number of CTAs per head for Cta_tile_p; equivalent to BMM1 split-K + int CTAS_PER_HEAD, + // The flags. 
+ uint32_t FLAGS = 0x8> +using Kernel_traits_v1_causal_mask = + Kernel_traits_; // MASK_VERSION_ + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Gmem_tile_o_dispatcher { + template + using Gmem_tile_o = fmha::v2::Gmem_tile_o; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Gmem_tile_o_dispatcher { + template + using Gmem_tile_o = fmha::v2::Gmem_tile_o_uint16; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Gmem_tile_o_dispatcher { + template + using Gmem_tile_o = fmha::v2::Gmem_tile_o_bfloat16; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + // The instruction traits. + typename Traits, + // The sequence length. + int S, + // The hidden size per head. + int D, + // The hidden dimension of V. + int DV, + // The number of timesteps per iteration of the main loop. + int STEP, + // The number of vertical warps. + int WARPS_M, + // The number of horizontal warps. + int WARPS_N, + // The number of CTAs per head for Cta_tile_p; equivalent to BMM1 split-K + int CTAS_PER_HEAD, + // The flags. + uint32_t FLAGS = 0x8, + // The attention mask version (see src/mask.h). + int MASK_VERSION = 2, + // Do we use half epilogue for the 2nd GEMM (hmma_fp32) + bool BMM2_FP16_EPILOGUE = true, + // The output type. + typename OutputType = typename Traits::A_type, + // The sage attention block size for Q, K and V + int SAGE_BLOCK_SIZE_Q = 0, int SAGE_BLOCK_SIZE_K = 0, int SAGE_BLOCK_SIZE_V = 0> +using Kernel_traits_v2 = + Kernel_traits_::Gmem_tile_o, + S, D, DV, STEP, WARPS_M, WARPS_N, CTAS_PER_HEAD, FLAGS, 2, MASK_VERSION, + BMM2_FP16_EPILOGUE, SAGE_BLOCK_SIZE_Q, SAGE_BLOCK_SIZE_K, SAGE_BLOCK_SIZE_V>; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + // The instruction traits. + typename Traits, + // The sequence length. + int S, + // The hidden size per head. + int D, + // The hidden dimension of V. + int DV, + // The number of timesteps per iteration of the main loop. + int STEP, + // The number of vertical warps. + int WARPS_M, + // The number of horizontal warps. + int WARPS_N, + // The number of CTAs per head for Cta_tile_p; equivalent to BMM1 split-K + int CTAS_PER_HEAD, + // The flags. + uint32_t FLAGS = 0x8, + // The attention mask version (see src/mask.h). + int MASK_VERSION = 2, + // Do we use half epilogue for the 2nd GEMM (hmma_fp32) + bool BMM2_FP16_EPILOGUE = true, + // The output type. + typename OutputType = typename Traits::A_type, + // The sage attention block size for Q, K and V + int SAGE_BLOCK_SIZE_Q = 0, int SAGE_BLOCK_SIZE_K = 0, int SAGE_BLOCK_SIZE_V = 0> +using Kernel_traits_v2_q_k_v = + Kernel_traits_::Gmem_tile_o, S, D, DV, STEP, WARPS_M, + WARPS_N, CTAS_PER_HEAD, FLAGS, 2, MASK_VERSION, BMM2_FP16_EPILOGUE, + SAGE_BLOCK_SIZE_Q, SAGE_BLOCK_SIZE_K, SAGE_BLOCK_SIZE_V>; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + // The instruction traits. + typename Traits, + // The sequence length. + int S, + // The hidden size per head. + int D, + // The hidden dimension of V. + int DV, + // The number of timesteps per iteration of the main loop. + int STEP, + // The number of vertical warps. + int WARPS_M, + // The number of horizontal warps. 
+ int WARPS_N, + // The number of CTAs per head for Cta_tile_p; equivalent to BMM1 split-K + int CTAS_PER_HEAD, + // The flags. + uint32_t FLAGS = 0x8, + // The attention mask version (see src/mask.h). + int MASK_VERSION = 2, + // Do we use half epilogue for the 2nd GEMM (hmma_fp32) + bool BMM2_FP16_EPILOGUE = true, + // The output type. + typename OutputType = typename Traits::A_type, + // The sage attention block size for Q, K and V + int SAGE_BLOCK_SIZE_Q = 0, int SAGE_BLOCK_SIZE_K = 0, int SAGE_BLOCK_SIZE_V = 0> +using Kernel_traits_v2_paged_kv_cache = + Kernel_traits_::Gmem_tile_o, S, D, DV, STEP, WARPS_M, + WARPS_N, CTAS_PER_HEAD, FLAGS, 2, MASK_VERSION, BMM2_FP16_EPILOGUE, + SAGE_BLOCK_SIZE_Q, SAGE_BLOCK_SIZE_K, SAGE_BLOCK_SIZE_V>; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + // The instruction traits. + typename Traits, + // The sequence length. + int S, + // The hidden size per head. + int D, + // The hidden dimension of V. + int DV, + // The number of timesteps per iteration of the main loop. + int STEP, + // The number of vertical warps. + int WARPS_M, + // The number of horizontal warps. + int WARPS_N, + // The number of CTAs per head for Cta_tile_p; equivalent to BMM1 split-K + int CTAS_PER_HEAD, + // The flags. + uint32_t FLAGS = 0x8, + // The attention mask version (see src/mask.h). + int MASK_VERSION = 2, + // Do we use half epilogue for the 2nd GEMM (hmma_fp32) + bool BMM2_FP16_EPILOGUE = true, + // The output type. + typename OutputType = typename Traits::A_type, + // The sage attention block size for Q, K and V + int SAGE_BLOCK_SIZE_Q = 0, int SAGE_BLOCK_SIZE_K = 0, int SAGE_BLOCK_SIZE_V = 0> +using Kernel_traits_v2_contiguous_kv_cache = + Kernel_traits_::Gmem_tile_o, S, D, 0, STEP, WARPS_M, + WARPS_N, CTAS_PER_HEAD, FLAGS, 2, MASK_VERSION, BMM2_FP16_EPILOGUE, + SAGE_BLOCK_SIZE_Q, SAGE_BLOCK_SIZE_K, SAGE_BLOCK_SIZE_V>; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + // The instruction traits. + typename Traits, + // The sequence length for K and V. + int S_KV, + // The hidden size per head. + int D, + // The number of timesteps per iteration of the main loop. + int STEP, + // The number of vertical warps. + int WARPS_M, + // The number of horizontal warps. + int WARPS_N, + // The number of CTAs per head for Cta_tile_p; equivalent to BMM1 split-K + int CTAS_PER_HEAD, + // The flags. + uint32_t FLAGS = 0x8> +using Kernel_traits_fmhca = + Kernel_traits_fmhca_; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace fmha diff --git a/csrc/fmha_v2/fmha/mask.h b/csrc/fmha_v2/fmha/mask.h new file mode 100644 index 0000000000..3219947ccf --- /dev/null +++ b/csrc/fmha_v2/fmha/mask.h @@ -0,0 +1,785 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2011-2024 NVIDIA CORPORATION & AFFILIATES. All rights + * reserved. SPDX-License-Identifier: NVIDIA TensorRT Source Code License Agreement + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. 
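Editor's note: the convenience aliases above (Kernel_traits_v1/v2, the q_k_v, paged and contiguous KV-cache variants, and Kernel_traits_fmhca) are the intended way for a kernel to pick up a full configuration. A hypothetical instantiation is sketched below; only the parameter order and the member names come from this header, while the numeric choices, the include path, and the chosen instruction traits are illustrative.

```cpp
// Illustrative sketch only (not part of this diff): instantiating the v2 kernel
// traits for a causal FP16 kernel. All numeric parameters are hypothetical.
#include <fmha/kernel_traits.h>  // assumed include path for this header

// Traits, S, D, DV, STEP, WARPS_M, WARPS_N, CTAS_PER_HEAD, FLAGS, MASK_VERSION.
using Kt = fmha::Kernel_traits_v2<fmha::Ampere_hmma_fp16_traits,
                                  /*S=*/512, /*D=*/64, /*DV=*/64, /*STEP=*/64,
                                  /*WARPS_M=*/4, /*WARPS_N=*/1, /*CTAS_PER_HEAD=*/1,
                                  /*FLAGS=*/0x8u, /*MASK_VERSION=*/3>;

static_assert(Kt::CAUSAL_MASK, "MASK_VERSION 3 selects the causal mask");
static_assert(!Kt::SLIDING_WINDOW_ATTENTION, "MASK_VERSION 4 would enable sliding window");
static_assert(Kt::SHARE_SMEM_FOR_K_AND_V, "FLAGS bit 0x8 shares the K/V smem buffer");
```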
+ */ + +#pragma once + +#include "fmha/traits.h" + +namespace fmha { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Mask {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Mask { + // The shape of the MMA tile. + using Mma_tile = typename Traits::template Mma_tile; + + // The number of MMAs in each dimension. + enum { MMAS_M = Mma_tile::MMAS_M }; + + enum { MMAS_N = Mma_tile::MMAS_N }; + + // Ctor. + template + inline __device__ Mask(Params const& params, Block_info const& block_info, int tidx) { + // The pointer. + packed_mask_ptr_ = reinterpret_cast(params.packed_mask_ptr); + // Take the head into account. + packed_mask_ptr_ += block_info.bidb * params.packed_mask_stride_in_bytes; + // The thread inside the CTA. + packed_mask_ptr_ += tidx * sizeof(uint32_t); + } + + // Load the mask into registers (and expand). + inline __device__ void load(int it) { + // One 32-bit integer per MMA. + uint32_t packed_mask[MMAS_M]; +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { + int offset = (it * MMAS_M + mi) * Cta_tile::THREADS_PER_CTA * sizeof(uint32_t); + fmha::ldg(packed_mask[mi], packed_mask_ptr_ + offset); + } + +// Expand the mask. +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { +#pragma unroll + for (int ni = 0; ni < MMAS_N; ++ni) { + mask_[2 * mi + 0][4 * ni + 0] = packed_mask[mi] & (1u << (8 * ni + 0)); + mask_[2 * mi + 0][4 * ni + 1] = packed_mask[mi] & (1u << (8 * ni + 1)); + mask_[2 * mi + 1][4 * ni + 0] = packed_mask[mi] & (1u << (8 * ni + 2)); + mask_[2 * mi + 1][4 * ni + 1] = packed_mask[mi] & (1u << (8 * ni + 3)); + mask_[2 * mi + 0][4 * ni + 2] = packed_mask[mi] & (1u << (8 * ni + 4)); + mask_[2 * mi + 0][4 * ni + 3] = packed_mask[mi] & (1u << (8 * ni + 5)); + mask_[2 * mi + 1][4 * ni + 2] = packed_mask[mi] & (1u << (8 * ni + 6)); + mask_[2 * mi + 1][4 * ni + 3] = packed_mask[mi] & (1u << (8 * ni + 7)); + } + } + } + + // Is a given position valid? + inline __device__ bool is_valid(int mi, int ni, int ii, int jj) const { + return mask_[mi * 2 + ii][ni * 4 + jj]; + } + + // The pointer to the mask. + char const* packed_mask_ptr_; + // The mask after expansion. + bool mask_[MMAS_M * 2][MMAS_N * 4]; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Mask { + // The instruction traits. + using Traits = Volta_hmma_fp16_traits; + // The shape of the MMA tile. + using Mma_tile = typename Traits::Mma_tile; + + // The number of MMAs in each dimension. + enum { MMAS_M = Mma_tile::MMAS_M }; + + enum { MMAS_N = Mma_tile::MMAS_N }; + + // Ctor. + template + inline __device__ Mask(Params const& params, Block_info const& block_info, int tidx) { + // The pointer. + packed_mask_ptr_ = reinterpret_cast(params.packed_mask_ptr); + // Take the head into account. + packed_mask_ptr_ += block_info.bidb * params.packed_mask_stride_in_bytes; + // The thread inside the CTA. + packed_mask_ptr_ += tidx * sizeof(uint32_t); + } + + // Load the mask into registers (and expand). + inline __device__ void load(int it) { + // One 32-bit integer per MMA. + uint32_t packed_mask[MMAS_M]; +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { + int offset = (it * MMAS_M + mi) * Cta_tile::THREADS_PER_CTA * sizeof(uint32_t); + fmha::ldg(packed_mask[mi], packed_mask_ptr_ + offset); + } + +// Expand the mask. 
+#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { +#pragma unroll + for (int ii = 0; ii < MMAS_N * 8; ++ii) { + mask_[mi][ii] = packed_mask[mi] & (1u << ii); + } + } + } + + // Is a given position valid? + inline __device__ bool is_valid(int mi, int ni, int, int jj) const { + return mask_[mi][ni * 8 + jj]; + } + + // The pointer to the mask. + char const* packed_mask_ptr_; + // The mask after expansion. + bool mask_[MMAS_M][MMAS_N * 8]; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Mask { + // That implementation works only when WARPS_K is 1. + static_assert(Cta_tile::WARPS_K == 1, ""); + + // The shape of the MMA tile. + using Mma_tile = typename Traits::template Mma_tile; + + // Ctor. + template + inline __device__ Mask(Params const& params, Block_info const& block_info, int tidx) + : seqlen_(block_info.actual_seqlen), col_loop_step_(0) { + // The decomposition of the thread index into warp/lane. + int warp = tidx / Cta_tile::THREADS_PER_WARP; + int lane = tidx % Cta_tile::THREADS_PER_WARP; + + // The position of the warp. + int warp_n = warp / Cta_tile::WARPS_M; + // The position of the thread. + col_ = block_info.bidn * Cta_tile::N + warp_n * 16 + lane % 4 * 2; + col_init_ = col_; + } + + // Is a given position valid? + inline __device__ bool is_valid(int, int ni, int, int jj) const { + // The position of the thread in the sequence. + int offset = this->col_ + this->col_loop_step_ * Cta_tile::N + ni * Mma_tile::N_PER_MMA_PER_CTA; + // The position inside the MMA. + offset += (jj & 0x02) * 4 + (jj & 0x1); + // Is it a valid position in the sequence? + return offset < seqlen_; + } + + // BERT Mask: if upper left is invalid, none are valid + inline __device__ bool any_valid(int mi, int ni) const { return is_valid(mi, ni, 0, 0); } + + // Move mask to next tile (flash attention) + inline __device__ void move() { this->col_ += Cta_tile::N; } + + // Move mask the col by offset (flash attention) + inline __device__ void move_to_offset(int offset) { this->col_ = col_init_ + offset; } + + // Reset mask to the initial col + inline __device__ void reset() { col_ = col_init_; } + + // Load the mask... Nothing to do for real. + inline __device__ void load(int) {} + + // Load the mask... we use it to keep track of to row, col (flash attention). + inline __device__ void load(int, int col_loop_step) { col_loop_step_ = col_loop_step; } + + // The length of the sequence. + int seqlen_; + // The left-most position of the thread in the sequence. + int col_, col_init_; + // The current col iteration + int col_loop_step_; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Mask : public Mask { + // V3 mask is the causal mask (e.g. for GPT) and extends V2 masks (self-attention). + using Base = Mask; + + // The shape of the MMA tile. + using Mma_tile = typename Base::Mma_tile; + + // Ctor. + template + inline __device__ Mask(Params const& params, Block_info const& block_info, int tidx) + : Base(params, block_info, tidx), row_loop_step_(0) { + int warp = tidx / Cta_tile::THREADS_PER_WARP; + int lane = tidx % Cta_tile::THREADS_PER_WARP; + + // The position of the warp. + int warp_m = warp % Cta_tile::WARPS_M; + row_ = warp_m * 16 + lane / 4; + } + + inline __device__ void get_row_col(int& row, int& col, int mi, int ni, int ii, int jj) const { + // The position of the thread in the sequence. 
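Editor's note: the version-2 (padding) mask above never touches memory; each thread simply checks whether the score-matrix columns it owns in the MMA layout fall inside the actual sequence length. The standalone reproduction of that column arithmetic below (non-Volta layout, flash-attention col_loop_step/move() offsets omitted) is not part of the diff, and CTA_N / N_PER_MMA_PER_CTA are hypothetical stand-ins for the Cta_tile/Mma_tile constants.

```cpp
// Standalone sketch (not part of this diff): the column positions tested by the
// version-2 padding mask above, for one thread in the non-Volta MMA layout.
#include <cstdio>

int main() {
  int const CTA_N = 128, N_PER_MMA_PER_CTA = 16, THREADS_PER_WARP = 32, WARPS_M = 4;
  int const seqlen = 70;  // actual (unpadded) sequence length
  int const tidx = 5, bidn = 0;

  int const warp = tidx / THREADS_PER_WARP;
  int const lane = tidx % THREADS_PER_WARP;
  int const warp_n = warp / WARPS_M;  // horizontal warp index
  int const col_init = bidn * CTA_N + warp_n * 16 + lane % 4 * 2;

  // Each MMA "ni" spans N_PER_MMA_PER_CTA columns; each thread owns 4 positions (jj) per MMA.
  for (int ni = 0; ni < 2; ++ni) {
    for (int jj = 0; jj < 4; ++jj) {
      int const col = col_init + ni * N_PER_MMA_PER_CTA + (jj & 0x02) * 4 + (jj & 0x1);
      std::printf("tidx=%d ni=%d jj=%d -> col=%d valid=%d\n", tidx, ni, jj, col, col < seqlen);
    }
  }
  return 0;
}
```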
+ row = this->row_ + this->row_loop_step_ + mi * Mma_tile::M_PER_MMA_PER_CTA; + // The position inside the MMA. + row += ii * 8; + + // The position of the thread in the sequence. + col = this->col_ + this->col_loop_step_ * Cta_tile::N + ni * Mma_tile::N_PER_MMA_PER_CTA; + // The position inside the MMA. + col += (jj & 0x02) * 4 + (jj & 0x1); + } + + // Is a given position valid? + inline __device__ bool is_valid(int mi, int ni, int ii, int jj) const { + int row, col; + get_row_col(row, col, mi, ni, ii, jj); + + // Is it a valid position in the sequence? + return is_valid(row, col); + } + + // Is a given position valid? + inline __device__ bool is_valid(int row, int col) const { + // Is it a valid position in the sequence, i.e. are we in the lower triangle? + return (row >= col); + } + + // GPT Mask: if lower left is invalid, none are valid + inline __device__ bool any_valid(int mi, int ni) const { return is_valid(mi, ni, 1, 0); } + + // Load the mask... we use it to keep track of to row. + inline __device__ void load(int row_loop_step) { row_loop_step_ = row_loop_step; } + + // Load the mask... we use it to keep track of to row, col (flash attention). + inline __device__ void load(int row_loop_step, int col_loop_step) { + row_loop_step_ = row_loop_step; + this->col_loop_step_ = col_loop_step; + } + + // The upper-most position of the thread in the sequence. + int row_; + // Current row step offset. + int row_loop_step_; +}; + +// Specialized mask for MTP (multi-token prediction used in MLA). +template +struct MtpMask : public Mask { + // MTP mask (causal mask) extends from V2 (dense) masks (self-attention). + using Base = Mask; + + // The shape of the MMA tile. + using Mma_tile = typename Base::Mma_tile; + + // Ctor. + template + inline __device__ MtpMask(Params const& params, Block_info const& block_info, int tidx) + : Base(params, block_info, tidx), + num_grouped_heads_(params.num_grouped_heads), + row_loop_step_(0) { + // Update the seqlen (excluding all MTP draft tokens). + this->seqlen_ = this->seqlen_ - (block_info.actual_q_seqlen / params.num_grouped_heads) + 1; + + int warp = tidx / Cta_tile::THREADS_PER_WARP; + int lane = tidx % Cta_tile::THREADS_PER_WARP; + + // The position of the warp. + int warp_m = warp % Cta_tile::WARPS_M; + row_ = warp_m * 16 + lane / 4; + } + + inline __device__ int get_row(int mi, int ii) const { + // The position of the thread in the sequence. + int row = this->row_ + this->row_loop_step_ + mi * Mma_tile::M_PER_MMA_PER_CTA; + // The position inside the MMA. + row += ii * 8; + return row; + } + + inline __device__ int get_col(int ni, int jj) const { + // The position of the thread in the sequence. + int col = this->col_ + this->col_loop_step_ * Cta_tile::N + ni * Mma_tile::N_PER_MMA_PER_CTA; + // The position inside the MMA. + col += (jj & 0x02) * 4 + (jj & 0x1); + return col; + } + + inline __device__ void get_row_col(int& row, int& col, int mi, int ni, int ii, int jj) const { + row = get_row(mi, ii); + col = get_col(ni, jj); + } + + // Is a given position valid? + inline __device__ bool is_valid(int mi, int ni, int ii, int jj) const { + int col = get_col(ni, jj); + + // Is it a valid position in the sequence? + return col < (this->seqlen_ + mtp_token_idx_[mi][ii]); + } + + // Is a given position valid? + inline __device__ bool is_valid(int row, int col) const { + // Is it a valid position in the sequence, i.e. are we in the lower triangle? + return (row >= col); + } + + // Load the mask... we use it to keep track of to row. 
+ inline __device__ void load(int row_loop_step) { + row_loop_step_ = row_loop_step; +// Update the MTP token index. +#pragma unroll + for (int mi = 0; mi < Mma_tile::MMAS_M; ++mi) { +#pragma unroll + for (int ii = 0; ii < 2; ++ii) { + mtp_token_idx_[mi][ii] = get_row(mi, ii) / num_grouped_heads_; + } + } + } + + // The number of grouped heads in the row dimension. + int num_grouped_heads_; + // The corresponding MTP token index for each row. + // FIXME: currently we assume 2 rows per thread (volta/hopper-gmma traits are not supported yet). + int mtp_token_idx_[Mma_tile::MMAS_M][2]; + // The upper-most position of the thread in the sequence. + int row_; + // The current row step offset. + int row_loop_step_; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// The lower triangle attention matrix. +// Assume we only pay attention to past sliding-window-size long sequence. +// v x x x x x x x x +// v v x x x x x x x +// v v v x x x x x x +// v v v v x x x x x +// v v v v v x x x x +// x v v v v v x x x +// x x v v v v v x x +// x x x v v v v v x +// x x x x v v v v v + +template +struct Mask : public Mask { + // V4 mask is the causal mask (e.g. for GPT) plus the sliding-window feature. + using Base = Mask; + + // The shape of the MMA tile. + using Mma_tile = typename Base::Mma_tile; + + // Ctor. + template + inline __device__ Mask(Params const& params, Block_info const& block_info, int tidx) + : Base(params, block_info, tidx), sliding_window_size_(params.sliding_window_size) {} + + // Is a given position valid? + inline __device__ bool is_valid(int mi, int ni, int ii, int jj) const { + int row, col; + this->get_row_col(row, col, mi, ni, ii, jj); + + // Is it a valid position in the sequence? + return is_valid(row, col); + } + + // Is a given position valid? + inline __device__ bool is_valid(int row, int col) const { + // Is it a valid position in the sequence, i.e. are we in the lower triangle? + return (row >= col) && (col >= max(0, row + 1 - sliding_window_size_)); + } + + // The sliding window size. + int sliding_window_size_; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// The custom mask (from global memory). +template +struct Mask : public Mask { + using Base = Mask; + + // The shape of the MMA tile. + using Mma_tile = typename Base::Mma_tile; + + // The number of MMAs in each dimension. + enum { MMAS_M = Mma_tile::MMAS_M }; + + enum { MMAS_N = Mma_tile::MMAS_N }; + + // One 32-bit packed mask holds 4 MMAS_N as one group. + enum { MMA_GROUPS_N = fmha::Div_up::VALUE }; + + // The MMAS_N in the group. + enum { MMAS_N_IN_GROUP = fmha::Min::VALUE }; + + // MMAS_N uses full 32-bit integer packed masks. + enum { FULL_PACKED_MASK = (MMAS_N % 4 == 0) }; + + // Ctor. + template + inline __device__ Mask(Params const& params, Block_info const& block_info, int tidx) + : Base(params, block_info, tidx), + packed_mask_ptr_(reinterpret_cast(params.packed_mask_ptr)), + params_packed_mask_stride_in_bytes_(params.packed_mask_stride_in_bytes), + row_offset_(0) { + // Add the thread offset in bytes. + packed_mask_ptr_ += + (block_info.sum_mask_row * params_packed_mask_stride_in_bytes_ + tidx * sizeof(uint32_t)); + } + + // Load the mask... we use it to keep track of row offset. + inline __device__ void load(int row_offset) { row_offset_ = row_offset; } + + // Load the mask into registers (and expand). 
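Editor's note: the version-3 and version-4 masks above reduce to pure (row, col) predicates: causal keeps the lower triangle, and the sliding-window variant additionally drops columns older than sliding_window_size. The standalone check below reproduces the pattern in the comment diagram above (9x9 with a window of 5); it is not part of the diff.

```cpp
// Standalone sketch (not part of this diff): the causal (v3) and sliding-window
// causal (v4) predicates used by the masks above.
#include <algorithm>
#include <cstdio>

bool causal_valid(int row, int col) { return row >= col; }

bool sliding_window_valid(int row, int col, int window) {
  return row >= col && col >= std::max(0, row + 1 - window);
}

int main() {
  int const size = 9, window = 5;  // matches the 9x9 diagram with a window of 5
  for (int row = 0; row < size; ++row) {
    for (int col = 0; col < size; ++col) {
      std::printf("%c ", sliding_window_valid(row, col, window) ? 'v' : 'x');
    }
    std::printf("\n");
  }
  return 0;
}
```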
+ inline __device__ void load_mask(int col_offset) { + // The packed_mask_offset in the col(N) dimension. + int mask_col_offset = int(col_offset / (Mma_tile::N_PER_MMA_PER_CTA * 4)) * + Cta_tile::THREADS_PER_CTA * sizeof(uint32_t); + // When MMAS_N < 4, one loaded packed_mask can be expanded to boolean masks + // of multiple iterations. + int local_col = FULL_PACKED_MASK ? 0 : (col_offset % (Mma_tile::N_PER_MMA_PER_CTA * 4)); + // The local mma ni if MMAS_N < 4. + int local_ni = local_col / 16; +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { + // The M dimension offset. + int offset = + (row_offset_ + mi * Mma_tile::M_PER_MMA_PER_CTA) * params_packed_mask_stride_in_bytes_; + // The N dimension offset. + offset += mask_col_offset; + // Set predicate to true only when next 32-bit packed mask is needed. + bool pred = local_col == 0; +#pragma unroll + for (int ni = 0; ni < MMA_GROUPS_N; ++ni) { + // The MMAS_N group offset. + if (pred) { + fmha::ldg(packed_mask_[mi][ni], + packed_mask_ptr_ + offset + ni * Cta_tile::THREADS_PER_CTA * sizeof(uint32_t)); + } + } + } + +// Expand the mask. +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { +#pragma unroll + for (int ni = 0; ni < MMA_GROUPS_N; ++ni) { +#pragma unroll + for (int nni = 0; nni < MMAS_N_IN_GROUP; ++nni) { + mask_[2 * mi + 0][(ni * 4 + nni) * 4 + 0] = + packed_mask_[mi][ni] & (1u << (8 * (nni + local_ni) + 0)); + mask_[2 * mi + 0][(ni * 4 + nni) * 4 + 1] = + packed_mask_[mi][ni] & (1u << (8 * (nni + local_ni) + 1)); + mask_[2 * mi + 1][(ni * 4 + nni) * 4 + 0] = + packed_mask_[mi][ni] & (1u << (8 * (nni + local_ni) + 2)); + mask_[2 * mi + 1][(ni * 4 + nni) * 4 + 1] = + packed_mask_[mi][ni] & (1u << (8 * (nni + local_ni) + 3)); + mask_[2 * mi + 0][(ni * 4 + nni) * 4 + 2] = + packed_mask_[mi][ni] & (1u << (8 * (nni + local_ni) + 4)); + mask_[2 * mi + 0][(ni * 4 + nni) * 4 + 3] = + packed_mask_[mi][ni] & (1u << (8 * (nni + local_ni) + 5)); + mask_[2 * mi + 1][(ni * 4 + nni) * 4 + 2] = + packed_mask_[mi][ni] & (1u << (8 * (nni + local_ni) + 6)); + mask_[2 * mi + 1][(ni * 4 + nni) * 4 + 3] = + packed_mask_[mi][ni] & (1u << (8 * (nni + local_ni) + 7)); + } + } + } + } + + // Move mask the col by offset (flash attention) + inline __device__ void move_to_offset(int col_offset) { load_mask(col_offset); } + + // Is a given position valid? + inline __device__ bool is_valid(int mi, int ni, int ii, int jj) const { + return mask_[mi * 2 + ii][ni * 4 + jj]; + } + + // Current row step offset. + int row_offset_; + + // The pointer to the mask. + char const* packed_mask_ptr_; + // The stride in the n dimension. + int64_t const params_packed_mask_stride_in_bytes_; + // The packed mask (one 32-bit integer per MMA GROUP, MMAS_M * 2 rows, MMA_GROUPS_N * 16 cols). + uint32_t packed_mask_[MMAS_M][MMA_GROUPS_N]; + // The mask after expansion. + bool mask_[MMAS_M * 2][MMAS_N * 4]; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Mask { + // The instruction traits. + using Traits = Volta_hmma_fp16_traits; + // The shape of the MMA tile. + using Mma_tile = typename Traits::Mma_tile; + + // That implementation works only when WARPS_K is 1. + static_assert(Cta_tile::WARPS_K == 1, ""); + + // Ctor. + template + inline __device__ Mask(Params const& params, Block_info const& block_info, int tidx) + : seqlen_(block_info.actual_seqlen) { + // The decomposition of the thread index into warp/lane. 
+ int warp = tidx / Cta_tile::THREADS_PER_WARP; + int lane = tidx % Cta_tile::THREADS_PER_WARP; + + // The position of the warp. + int warp_n = warp / Cta_tile::WARPS_M; + // The position of the thread. + col_ = block_info.bidn * Cta_tile::N + warp_n * 16 + (lane & 0x08) / 2; + col_init_ = col_; + } + + // Is a given position valid? + inline __device__ bool is_valid(int, int ni, int, int jj) const { + // The position of the thread in the sequence. + int offset = this->col_ + ni * Mma_tile::N_PER_MMA_PER_CTA; + // The position inside the MMA. + offset += (jj & 0x04) * 2 + (jj & 0x03); + // Is it a valid position in the sequence? + return offset < seqlen_; + } + + // Load the mask... Nothing to do for real. + inline __device__ void load(int) {} + + // Reset mask to the initial col + inline __device__ void reset() { col_ = col_init_; } + + // Move mask to next tile (flash attention) + inline __device__ void move() { this->col_ += Cta_tile::N; } + + // Move mask the col by offset (flash attention) + inline __device__ void move_to_offset(int offset) { this->col_ = col_init_ + offset; } + + // The length of the sequence. + int const seqlen_; + // The left-most position of the thread in the sequence. + int col_, col_init_; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Mask + : public Mask { + // V3 mask is the causal mask (e.g. for GPT) and extends V2 masks (self-attention). + using Base = Mask; + + // The shape of the MMA tile. + using Mma_tile = typename Base::Mma_tile; + + // Ctor. + template + inline __device__ Mask(Params const& params, Block_info const& block_info, int tidx) + : Base(params, block_info, tidx), loop_step_(0) { + int warp = tidx / Cta_tile::THREADS_PER_WARP; + int lane = tidx % Cta_tile::THREADS_PER_WARP; + + // The position of the warp. + int warp_m = warp % Cta_tile::WARPS_M; + row_ = warp_m * 16 + (lane & 0x07) + (lane & 0x10) / 2; + } + + inline __device__ void get_row_col(int& row, int& col, int mi, int ni, int ii, int jj) const { + // The position of the thread in the sequence. + row = this->row_ + this->loop_step_ + mi * Mma_tile::M_PER_MMA_PER_CTA; + + // The position of the thread in the sequence. + col = this->col_ + ni * Mma_tile::N_PER_MMA_PER_CTA; + // The position inside the MMA. + col += (jj & 0x04) * 2 + (jj & 0x03); + } + + // Is a given position valid? + inline __device__ bool is_valid(int mi, int ni, int ii, int jj) const { + int row, col; + get_row_col(row, col, mi, ni, ii, jj); + + // Is it a valid position in the sequence? + return is_valid(row, col); + } + + // Is a given position valid? + inline __device__ bool is_valid(int row, int col) const { + // Is it a valid position in the sequence, i.e. are we in the lower triangle? + return (row >= col) && (col < this->seqlen_); + } + + // GPT Mask: if lower left is invalid, none are valid + inline __device__ bool any_valid(int mi, int ni) const { return is_valid(mi, ni, 0, 0); } + + // Load the mask... we use it to keep track of to row. + inline __device__ void load(int loop_step) { loop_step_ = loop_step; } + + // The upper-most position of the thread in the sequence. + int row_; + // Current iteration. 
+ int loop_step_; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Mask_dispatcher {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Mask_dispatcher + : public Mask { + using Base = Mask; + + template + inline __device__ Mask_dispatcher(Params const& params, Block_info const& block_info, int tidx) + : Base(params, block_info, tidx) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Mask_dispatcher : public MtpMask { + using Base = MtpMask; + + template + inline __device__ Mask_dispatcher(Params const& params, Block_info const& block_info, int tidx) + : Base(params, block_info, tidx) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Mask_hopper { + // The shape of the MMA tile. + using Mma_tile = typename Traits::template Mma_tile; + + // Ctor. + template + inline __device__ Mask_hopper(Params const& params, Block_info const& block_info, int tidx) + : seqlen_(block_info.actual_seqlen) { + // For Hopper the warp distribution is always 4x1 within a warpgroup. + // So maybe there is some assumptions/optimizations to be made here. + + int warp = tidx / Cta_tile::THREADS_PER_WARP; + int warp_n = warp / 4; + int lane = tidx % Cta_tile::THREADS_PER_WARP; + col_ = warp_n * Mma_tile::N_PER_WARP_GROUP + (lane % 4) * 2; + } + + // Is a given position valid? + inline __device__ bool is_valid(int, int ni, int, int jj) const { + // The position of the thread in the sequence. + int offset = this->col_ + ni * Mma_tile::N_PER_MMA; + // The position inside the MMA. + offset += (jj / 2) * 8 + (jj % 2); + // Is it a valid position in the sequence? + return offset < seqlen_; + } + + // Load the mask... Nothing to do for real. + inline __device__ void load(int) {} + + // The length of the sequence. + int const seqlen_; + // The left-most position of the thread in the sequence. + int col_; +}; + +template +struct Mask_hopper { + // The shape of the MMA tile. + using Mma_tile = typename Traits::template Mma_tile; + + // Ctor. + template + inline __device__ Mask_hopper(Params const& params, Block_info const& block_info, int tidx) { + // For Hopper the warp distribution is always 4x1 within a warpgroup. + // So maybe there is some assumptions/optimizations to be made here. + + int warp = tidx / Cta_tile::THREADS_PER_WARP; + int warp_n = warp / 4; + int warp_m = warp % 4; + int lane = tidx % Cta_tile::THREADS_PER_WARP; + col_ = warp_n * Mma_tile::N_PER_WARP_GROUP + (lane % 4) * 2; + row_base_ = warp_m * 16 + lane / 4; + row_ = row_base_; + } + + inline __device__ void get_row_col(int& row, int& col, int mi, int ni, int ii, int jj) const { + // The row position of the thread in the sequence. + row = row_ + mi * Mma_tile::M_PER_MMA + ii * 8; + + // The position of the thread in the sequence. + col = this->col_ + ni * Mma_tile::N_PER_MMA; + // The position inside the MMA. + col += (jj / 2) * 8 + (jj % 2); + } + + // Is a given position valid? + inline __device__ bool is_valid(int mi, int ni, int ii, int jj) const { + int row, col; + get_row_col(row, col, mi, ni, ii, jj); + + // Is it a valid position in the sequence? + return is_valid(row, col); + } + + // Is a given position valid? + inline __device__ bool is_valid(int row, int col) const { + // Is it a valid position in the sequence? 
+ return col <= row; + } + + // Load the mask... Nothing to do for real. + inline __device__ void load(int loop_step) { row_ = row_base_ + loop_step * Cta_tile::M; } + + // The left-most position of the thread in the sequence. + int row_, row_base_, col_; +}; + +template +struct Mask_hopper : public Mask_hopper { + // V4 mask is the causal mask (e.g. for GPT) plus the sliding-window feature. + using Base = Mask_hopper; + + // The shape of the MMA tile. + using Mma_tile = typename Traits::template Mma_tile; + + // Ctor. + template + inline __device__ Mask_hopper(Params const& params, Block_info const& block_info, int tidx) + : Base(params, block_info, tidx), sliding_window_size_(params.sliding_window_size) {} + + // Is a given position valid? + inline __device__ bool is_valid(int mi, int ni, int ii, int jj) const { + int row, col; + this->get_row_col(row, col, mi, ni, ii, jj); + + // Is it a valid position in the sequence? + return is_valid(row, col); + } + + // Is a given position valid? + inline __device__ bool is_valid(int row, int col) const { + // Is it a valid position in the sequence? + return col <= row && col >= max(0, row + 1 - sliding_window_size_); + } + + // The sliding window size for attention. + int sliding_window_size_; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace fmha diff --git a/csrc/fmha_v2/fmha/numeric_types.h b/csrc/fmha_v2/fmha/numeric_types.h new file mode 100644 index 0000000000..1c3ec1a615 --- /dev/null +++ b/csrc/fmha_v2/fmha/numeric_types.h @@ -0,0 +1,57 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights + * reserved. SPDX-License-Identifier: NVIDIA TensorRT Source Code License Agreement + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. + */ + +#include + +#include + +#pragma once + +#if CUDART_VERSION >= 11080 +// TODO Better way? 
+#define FMHA_CUDA_SUPPORTS_FP8 true +#endif +#include +#if FMHA_CUDA_SUPPORTS_FP8 +#include +#endif +namespace fmha { + +using fp16_t = uint16_t; +using fp32_t = float; +using tf32_t = uint32_t; +using bf16_t = nv_bfloat16; +#if FMHA_CUDA_SUPPORTS_FP8 +using e4m3_t = __nv_fp8_e4m3; +using e5m2_t = __nv_fp8_e5m2; +#else +using e4m3_t = char; +using e5m2_t = char; +#endif + +static constexpr float MAX_E4M3 = 448.f; // 0x7E 2^8 * 1.75 +static constexpr float MAX_E5M2 = 57344.f; // 0x7B 2^15 * 1.75 + +template +__host__ __device__ constexpr inline float Softmax_fp_quant_scale(); + +template <> +__host__ __device__ constexpr inline float Softmax_fp_quant_scale() { + // Softmax has max output of 1.0, therefore we choose fp32-to-fp8 quantization scale as the + // largest power-of-2 below the e4m3 limit: + // 2^(floor(log2(E4M3_MAX / amax_exp_p))) = 2^(floor(log2(448 / 1))) = 2 ^ 8 + return 256.f; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace fmha diff --git a/csrc/fmha_v2/fmha/paged_kv_cache.h b/csrc/fmha_v2/fmha/paged_kv_cache.h new file mode 100644 index 0000000000..a8e13a61d0 --- /dev/null +++ b/csrc/fmha_v2/fmha/paged_kv_cache.h @@ -0,0 +1,63 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2011-2024 NVIDIA CORPORATION & AFFILIATES. All rights + * reserved. SPDX-License-Identifier: NVIDIA TensorRT Source Code License Agreement + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. + */ + +#pragma once + +#include +#include + +namespace fmha { + +// This needs to be aligned with the definition in TRT-LLM +struct Kv_block_array { + using PtrType = int32_t; + + // Maximum number of sequences supported by the kv-cache. + int32_t mMaxSeqs; + // Max number of blocks per sequence + int32_t mMaxBlocksPerSeq; + // Number of tokens. It must be power of 2. + int32_t mTokensPerBlock; + // Exponent of number of tokens with base 2. + // E.g. for mTokensPerBlock 64, mTokensPerBlockLog2 equals to 6 + int32_t mTokensPerBlockLog2; + // Table maps logical block idx to the data pointer of k/v cache block pool + // Shape [B, W, 2, M], where 2 is table for K and V, + // B is current number of sequences + // W is beam width + // M is Max number of blocks per sequence + + // Size of KV cache blocks in bytes (H*D*T*sizeof(DataType)) + int32_t mBytesPerBlock; + // Pointer to beginning of pool. + void* mPoolPtr; + // Pointer to block offsets. 
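Editor's note: Softmax_fp_quant_scale above returns 256.f because softmax outputs are bounded by 1.0 and 2^floor(log2(448 / 1)) = 2^8 is the largest power of two that keeps a scaled probability below the e4m3 maximum. The standalone check below only re-derives that number; the helper name and signature are local to this sketch, not part of the diff.

```cpp
// Standalone sketch (not part of this diff): the derivation behind the 256.f
// softmax quantization scale above -- the largest power of two that keeps a
// probability in [0, 1] below the e4m3 maximum of 448.
#include <cmath>
#include <cstdio>

float softmax_fp_quant_scale(float fp_max, float amax /* max softmax output */) {
  return std::exp2(std::floor(std::log2(fp_max / amax)));
}

int main() {
  float const MAX_E4M3 = 448.f;
  float const scale = softmax_fp_quant_scale(MAX_E4M3, 1.f);
  std::printf("scale = %.0f\n", scale);                             // 256
  std::printf("1.0 * scale = %.0f (< %.0f)\n", scale, MAX_E4M3);    // stays representable in e4m3
  return 0;
}
```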
+ PtrType* mBlockOffsets;
+
+ Kv_block_array() = default;
+
+ Kv_block_array(int32_t batchSize, int32_t maxBlocksPerSeq, int32_t tokensPerBlock,
+ int32_t bytesPerBlock, void* poolPtr)
+ : mMaxSeqs(batchSize),
+ mMaxBlocksPerSeq(maxBlocksPerSeq),
+ mTokensPerBlock(tokensPerBlock),
+ mBytesPerBlock{bytesPerBlock},
+ mPoolPtr{poolPtr},
+ mBlockOffsets{nullptr} {
+ float const tokensPerBlockSeqLog2 = log2(mTokensPerBlock);
+ mTokensPerBlockLog2 = static_cast<int32_t>(tokensPerBlockSeqLog2);
+ }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace fmha
diff --git a/csrc/fmha_v2/fmha/smem_tile.h b/csrc/fmha_v2/fmha/smem_tile.h
new file mode 100644
index 0000000000..dd75cf7bdb
--- /dev/null
+++ b/csrc/fmha_v2/fmha/smem_tile.h
@@ -0,0 +1,2071 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2011-2024 NVIDIA CORPORATION & AFFILIATES. All rights
+ * reserved. SPDX-License-Identifier: NVIDIA TensorRT Source Code License Agreement
+ *
+ * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+ * property and proprietary rights in and to this material, related
+ * documentation and any modifications thereto. Any use, reproduction,
+ * disclosure or distribution of this material and related documentation
+ * without an express license agreement from NVIDIA CORPORATION or
+ * its affiliates is strictly prohibited.
+ */
+
+#pragma once
+
+#include 
+#include 
+#include 
+
+namespace fmha {
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+ // The description of the tile computed by this CTA.
+ typename Cta_tile,
+ // The number of rows in the 2D shared memory buffer.
+ int M_,
+ // The number of cols.
+ int N_,
+ // The size in bits of each element.
+ int BITS_PER_ELEMENT_,
+ // The number of bytes per STS.
+ int BYTES_PER_STS_ = 16,
+ // The number of buffers. (Used in multistage and double buffer cases.)
+ int BUFFERS_PER_TILE_ = 1,
+ // Do we enable the fast path for LDS.128 and friends.
+ int ENABLE_LDS_FAST_PATH_ = 0,
+ // The number of rows that are used for the XOR swizzling to allow fast STS/LDS.
+ int ROWS_PER_XOR_PATTERN_ = 8,
+ // The number of cols that are used for the XOR swizzling to allow fast STS/LDS.
+ int COLS_PER_XOR_PATTERN_ = 1,
+ // Use or not predicates
+ bool USE_PREDICATES_ = true,
+ // Use TMA or not,
+ bool USE_TMA_ = false,
+ // The leading dim elements in shared memory
+ int LEAD_DIM_ELEMENTS_ = N_>
+struct Smem_tile_without_skews {
+ // The type of this tile
+ using Smem_tile_ =
+ Smem_tile_without_skews<Cta_tile, M_, N_, BITS_PER_ELEMENT_, BYTES_PER_STS_, BUFFERS_PER_TILE_, ENABLE_LDS_FAST_PATH_, ROWS_PER_XOR_PATTERN_, COLS_PER_XOR_PATTERN_, USE_PREDICATES_, USE_TMA_, LEAD_DIM_ELEMENTS_>;
+
+ static constexpr bool USE_TMA = USE_TMA_;
+
+ // The size in bits of each element.
+ enum { BITS_PER_ELEMENT = BITS_PER_ELEMENT_ };
+
+ // The size in bytes of a single STS.
+ enum { BYTES_PER_STS = BYTES_PER_STS_ };
+
+ // The number of elements per STS.
+ enum { ELEMENTS_PER_STS = BYTES_PER_STS * 8 / BITS_PER_ELEMENT };
+
+ // To support arbitrary N, we pad some values to a power-of-2.
+ enum { N_WITH_PADDING = Next_power_of_two<N_>::VALUE };
+
+ // The number of bytes per row without packing of rows.
+ enum { BYTES_PER_ROW_BEFORE_PACKING = N_WITH_PADDING * BITS_PER_ELEMENT / 8 };
+
+ // The number of bytes per row -- we want at least 128B per row.
+ enum { BYTES_PER_ROW = Max<BYTES_PER_ROW_BEFORE_PACKING, 128>::VALUE };
+
+ // The number of rows in shared memory (two rows may be packed into a single one).
+ enum { ROWS = M_ * N_ / LEAD_DIM_ELEMENTS_ * BYTES_PER_ROW_BEFORE_PACKING / BYTES_PER_ROW };
+
+ // The number of threads per row.
+ enum { THREADS_PER_ROW_UNBOUNDED = BYTES_PER_ROW / BYTES_PER_STS }; + + // The number of threads per row. + enum { THREADS_PER_ROW = Min::VALUE }; + + // The number of STS per row. + enum { STS_PER_ROW = BYTES_PER_ROW / THREADS_PER_ROW / BYTES_PER_STS }; + + // It must be at least one. + static_assert(STS_PER_ROW >= 1, ""); + + // The number of rows written with a single STS. + enum { ROWS_PER_STS = Cta_tile::THREADS_PER_CTA / THREADS_PER_ROW }; + + // Make sure we write to at least one row per STS. Thanks Dr. Obvious ;) + static_assert(ROWS_PER_STS >= 1, ""); + + // The number of STS needed to store all rows. + enum { STS_PER_COL = Div_up::VALUE }; + + // The number of STS in total. + enum { STS = STS_PER_COL * STS_PER_ROW }; + + // The size of one buffer in bytes in shared memory. + enum { BYTES_PER_BUFFER = STS * BYTES_PER_STS * Cta_tile::THREADS_PER_CTA }; + + // The number of buffers. + enum { BUFFERS_PER_TILE = BUFFERS_PER_TILE_ }; + + // The size in bytes of total buffers. + enum { BYTES_PER_TILE = BYTES_PER_BUFFER * BUFFERS_PER_TILE }; + + // The boundary for smem_read_offset and smem_write_offset increment. + enum { BYTES_PER_TILE_INC_BOUNDARY = BYTES_PER_TILE - BYTES_PER_BUFFER }; + + // The number of rows that are used for the XOR swizzling to allow fast STS/LDS. + enum { ROWS_PER_XOR_PATTERN = ROWS_PER_XOR_PATTERN_ }; + + // The number of cols that are used for the XOR swizzling to allow fast STS/LDS. + enum { COLS_PER_XOR_PATTERN = COLS_PER_XOR_PATTERN_ * 16 / BYTES_PER_STS }; + + // Use or not predicates + enum { USE_PREDICATES = USE_PREDICATES_ }; + + // The bytes of one shmem row + enum { BYTES_PER_SHMEM_ROW = 128 }; + + // The type of elements that are stored in shared memory by each thread. + using Store_type = typename Uint_from_size_in_bytes::Type; + + // Ctor. + inline __device__ Smem_tile_without_skews(void* smem, int tidx) + : smem_(__nvvm_get_smem_pointer(smem)) { + // The row written by a thread. See doc/mma_smem_layout.xlsx. + int smem_write_row = tidx / THREADS_PER_ROW; + + // The XOR pattern. + int smem_write_xor = smem_write_row % ROWS_PER_XOR_PATTERN * COLS_PER_XOR_PATTERN; + // Compute the column and apply the XOR pattern. + int smem_write_col = (tidx % THREADS_PER_ROW) ^ smem_write_xor; + + // The offset. + this->smem_write_offset_ = smem_write_row * BYTES_PER_ROW + smem_write_col * BYTES_PER_STS; + + // That code is expected to trigger the utilization of the URF by the compiler. + this->smem_read_buffer_ = __shfl_sync(0xffffffff, 0, 0); + this->smem_write_buffer_ = __shfl_sync(0xffffffff, 0, 0); + } + + // Compute the store pointers. + template + inline __device__ void compute_store_pointers(uint32_t (&ptrs)[N]) { +#pragma unroll + for (int ii = 0; ii < N; ++ii) { + // Decompose the STS into row/col. + int row = ii % STS_PER_COL; + int col = ii / STS_PER_COL; + + // Compute the immediate. + int imm = row; + + // Assemble the offset. + int offset = smem_write_offset_ + imm * ROWS_PER_STS * BYTES_PER_ROW; + + // Take the column into account. + if (STS_PER_ROW > 1) { + offset += col * THREADS_PER_ROW * BYTES_PER_STS; + } + + // Apply the XOR pattern if needed. 
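+ // (Illustrative note: with, e.g., ROWS_PER_XOR_PATTERN = 8, COLS_PER_XOR_PATTERN = 1 and 16B
+ // stores, row r has its 16B slot index XORed with r % 8, so a column of eight consecutive
+ // rows maps to eight distinct slots and avoids shared memory bank conflicts.)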
+ if (ROWS_PER_STS < ROWS_PER_XOR_PATTERN) { + int const m = row * ROWS_PER_STS % ROWS_PER_XOR_PATTERN; + offset ^= m * COLS_PER_XOR_PATTERN * BYTES_PER_STS; + } + +// Assemble the final pointer :) +#pragma unroll + for (int k = 0; k < K; k++) { + ptrs[ii * K + k] = smem_ + offset + k * (BYTES_PER_STS / K) + smem_write_buffer_; + } + } + } + + inline __device__ void debug_reset() { + for (int buffer = 0; buffer < BYTES_PER_TILE; buffer += BYTES_PER_BUFFER) { + for (int row = 0; row < ROWS; ++row) { + for (int col = 0; col < BYTES_PER_ROW; col += 4) { + if (threadIdx.x == 0) { + uint32_t val = 0x0; + sts(val, smem_ + row * BYTES_PER_ROW + col + buffer); + } + } + } + } + } + + // Print the content of the tile (only for debug ;)). + inline __device__ void debug_print() const { + for (int buffer = 0; buffer < BYTES_PER_TILE; buffer += BYTES_PER_BUFFER) { + for (int row = 0; row < ROWS; ++row) { + for (int col = 0; col < BYTES_PER_ROW; col += 4) { + if (threadIdx.x == 0) { + uint32_t val; + lds(val, smem_ + row * BYTES_PER_ROW + col + buffer); + printf( + "block=(x=%2d, y=%2d, z=%2d) (smem_=0x%08x, buffer=%2d, row=%2d, " + "byte=%4d)=0x%08x\n", + blockIdx.x, blockIdx.y, blockIdx.z, smem_, buffer, row, col, val); + } + } + } + } + } + + // Move the read offset to next buffer. + inline __device__ void move_to_next_read_buffer() { + if (BUFFERS_PER_TILE > 1 && smem_read_buffer_ >= BYTES_PER_TILE_INC_BOUNDARY) { + this->smem_read_buffer_ -= BYTES_PER_TILE_INC_BOUNDARY; + } else if (BUFFERS_PER_TILE > 1) { + this->smem_read_buffer_ += BYTES_PER_BUFFER; + } + } + + // Move the read offset to next buffer. TODO: Remove this member function!!! + inline __device__ void move_next_read_buffer() { this->move_to_next_read_buffer(); } + + // Move the read offset to next N buffer (circular-buffer). + inline __device__ void move_to_next_read_buffer(int N) { + if (BUFFERS_PER_TILE > 1) { + this->smem_read_buffer_ += N * BYTES_PER_BUFFER; + this->smem_read_buffer_ -= smem_read_buffer_ >= BYTES_PER_TILE ? BYTES_PER_TILE : 0; + } + } + + // Move the read offset to next N buffer (circular-buffer). TODO: Remove this member function!!! + inline __device__ void move_next_read_buffer(int N) { this->move_to_next_read_buffer(N); } + + // Move the write offset to next buffer. + inline __device__ void move_to_next_write_buffer() { + if (BUFFERS_PER_TILE > 1 && smem_write_buffer_ >= BYTES_PER_TILE_INC_BOUNDARY) { + this->smem_write_buffer_ -= BYTES_PER_TILE_INC_BOUNDARY; + } else if (BUFFERS_PER_TILE > 1) { + this->smem_write_buffer_ += BYTES_PER_BUFFER; + } + } + + // Move the write offset to next buffer. TODO: Remove that member function! + inline __device__ void move_next_write_buffer() { this->move_to_next_write_buffer(); } + + // Move the read offset. + inline __device__ void move_read_offset(int delta) { this->smem_read_offset_ += delta; } + + // Move the write offset. + inline __device__ void move_write_offset(int delta) { this->smem_write_offset_ += delta; } + + // Store to the tile in shared memory. + template + inline __device__ void store(Store_type const (&data)[N]) { + uint32_t smem_ptrs[N]; + this->compute_store_pointers(smem_ptrs); + sts(smem_ptrs, data); + } + + // Store to the tile in shared memory. + template + inline __device__ void store(Store_type const (&data)[N], uint32_t (&preds)[M]) { + uint32_t smem_ptrs[N]; + this->compute_store_pointers(smem_ptrs); + sts(smem_ptrs, data, preds); + } + + // Store to the tile in shared memory. 
+ template + inline __device__ void store(Store_type const (&data)[N], uint32_t preds) { + this->store(data, preds); + } + + // Store to the tile in shared memory. TODO: Remove last template arguments. + template + inline __device__ void store(void const* (&gmem_ptrs)[N], uint32_t (&preds)[M]) { + uint32_t smem_ptrs[N]; + this->compute_store_pointers(smem_ptrs); + ldgsts(smem_ptrs, gmem_ptrs, preds); + } + + // Store to the tile in shared memory. + template + inline __device__ void store(void const* (&gmem_ptrs)[N], uint32_t preds, uint64_t = 0) { + uint32_t tmp[1] = {preds}; + this->store(gmem_ptrs, tmp); + } + + // Store to the tile in shared memory. + template + inline __device__ void store(void const* (&gmem_ptrs)[N], uint32_t preds) { + uint32_t tmp[1] = {preds}; + this->store(gmem_ptrs, tmp); + } + + inline __device__ void add_smem_barrier_base(uint64_t*) {} + + // The shared memory pointer. + uint32_t smem_; + // The read offset. Reserve 4 offsets if needed. + int smem_read_offset_; + // The write offset. + int smem_write_offset_; + // The buffer base offset for read. + int smem_read_buffer_; + // The buffer base offset for write. + int smem_write_buffer_; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// Use TMA +template < + // The description of the tile computed by this CTA. + typename Cta_tile, + // The number of rows in the 2D shared memory buffer. + int M_, + // The number of cols. + int N_, + // The size in bits of each element. + int BITS_PER_ELEMENT_, + // The number of bytes per STS. Not relevant for TMA + int BYTES_PER_STS_, + // The number of buffers. (Used in multistage and double buffer cases.) + int BUFFERS_PER_TILE_, + // Do we enable the fast path for LDS.128 and friends. + int ENABLE_LDS_FAST_PATH_, + // The number of rows that are used for the XOR swizzling to allow fast STS/LDS. + int ROWS_PER_XOR_PATTERN_, + // The number of cols that are used for the XOR swizzling to allow fast STS/LDS. + int COLS_PER_XOR_PATTERN_, + // Use or not predicates + bool USE_PREDICATES_, + // The leading dim elements in shared memory + int LEAD_DIM_ELEMENTS_> +struct Smem_tile_without_skews + : public Smem_tile_without_skews { + // Base struct + using Base = + Smem_tile_without_skews; + static constexpr bool USE_TMA = true; + + // Tile size overrides. 
STS per thread not relevant for TMA + static constexpr int BYTES_PER_BUFFER = M_ * N_ * Base::BITS_PER_ELEMENT / 8; + static constexpr int BYTES_PER_TILE = BYTES_PER_BUFFER * Base::BUFFERS_PER_TILE; + static constexpr int BYTES_PER_TILE_INC_BOUNDARY = BYTES_PER_TILE - BYTES_PER_BUFFER; + // The number of bytes per barrier + static constexpr int BYTES_PER_BARRIER = 8; + + // Ctor + inline __device__ Smem_tile_without_skews(void* smem, int tidx) : Base(smem, tidx) { + this->smem_write_offset_ = __nvvm_get_smem_pointer(smem); + this->smem_barrier_offset_ = 0; + this->elect_one_ = elect_one_sync(); + } + + inline __device__ void add_smem_barrier_base(uint64_t* smem_barrier) { + this->smem_barrier_ = smem_barrier; + this->smem_barrier_offset_ = __nvvm_get_smem_pointer(this->smem_barrier_); + } + + /** + * \brief load tensor blocks from global memory and stores to shared memory using tma instructions + * + * \param p_desc pointer to tma descriptor masked as const void* pointer + * \param smem_offset shared memory offset in bytes relative to smem_write_buffer_ + * \param coord0 tensor access coordinate in dimension 1, used by tma load + * \param coord1 tensor access coordinate in dimension 2, used by tma load + * \param coord2 tensor access coordinate in dimension 3, used by tma load + * \param coord3 tensor access coordinate in dimension 4, used by tma load + * \param coord4 tensor access coordinate in dimension 5, used by tma load + * \param filter_offsets encodes multicast cta id and filter offsets + */ + template + inline __device__ void store(void const* p_desc, unsigned const& smem_offset, int32_t coord0, + int32_t coord1, int32_t coord2, int32_t coord3, int32_t coord4, + uint16_t filter_offsets, uint16_t mcast_cta_mask, + uint64_t mem_desc) { + uint32_t smem = this->smem_write_offset_ + smem_offset; + fmha::utmaldg( + reinterpret_cast(p_desc), smem, unsigned(this->smem_barrier_offset_), + coord0, coord1, coord2, coord3, coord4, filter_offsets, mcast_cta_mask, mem_desc, + this->elect_one_); + } + + // Same function as above but for runtime cga dimension + template + inline __device__ void store(void const* p_desc, unsigned const& smem_offset, int32_t coord0, + int32_t coord1, int32_t coord2, int32_t coord3, int32_t coord4, + uint16_t filter_offsets, uint16_t mcast_cta_mask, uint64_t mem_desc, + bool mcast_enabled) { + uint32_t smem = this->smem_write_offset_ + smem_offset; + fmha::utmaldg(reinterpret_cast(p_desc), smem, + unsigned(this->smem_barrier_offset_), coord0, coord1, coord2, + coord3, coord4, filter_offsets, mcast_cta_mask, mcast_enabled, + mem_desc, this->elect_one_); + } + + // Move the write offset to next buffer. + inline __device__ void move_next_write_buffer() { + if (Base::BUFFERS_PER_TILE > 1) { + this->smem_write_offset_ += (this->smem_write_offset_ >= BYTES_PER_TILE_INC_BOUNDARY) + ? -BYTES_PER_TILE_INC_BOUNDARY + : BYTES_PER_BUFFER; + this->smem_barrier_offset_ += + (this->smem_barrier_offset_ >= Base::BUFFERS_PER_TILE * BYTES_PER_BARRIER) + ? -Base::BUFFERS_PER_TILE * BYTES_PER_BARRIER + : BYTES_PER_BARRIER; + } + } + + inline __device__ void move_next_write_buffer(int buffer_id) { + if (Base::BUFFERS_PER_TILE > 1) { + this->smem_write_offset_ = this->smem_ + buffer_id * BYTES_PER_BUFFER; + } + this->smem_barrier_offset_ = __nvvm_get_smem_pointer(this->smem_barrier_ + buffer_id); + } + + // Move the read offset to next buffer. 
+ // do nothing, as it is controlled by gmma desc + inline __device__ void move_next_read_buffer() {} + + uint64_t* smem_barrier_; + uint32_t smem_barrier_offset_; + // elect one thread to issue utmaldg + uint32_t elect_one_; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + // The instruction traits. + typename Traits, + // The dimensions of the tile computed by the CTA. + typename Cta_tile, + // The layout of the tile. + typename Layout, + // The size of the STS. + int BYTES_PER_STS = 16, + // The number of buffers per tile. + int BUFFERS_PER_TILE = 1, + // Use or not predicates + bool USE_PREDICATES = true> +struct Smem_tile_a {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Rows_per_xor_pattern_volta_a { + // The size in bits. + enum { N_IN_BITS = N * Traits::BITS_PER_ELEMENT_A }; + + // The number of rows. + enum { VALUE = N_IN_BITS <= 256 ? 1 : (N_IN_BITS <= 512 ? 2 : 4) }; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Compute_reset_mask { + // The potential mask. + enum { HALF = MMAS_K_WITH_PADDING / 2 }; + + // The remainder. + enum { MOD = MMAS_K % HALF }; + + // The final value. + enum { VALUE = (MMAS_K == MOD ? 0 : HALF) | Compute_reset_mask::VALUE }; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Compute_reset_mask<0, MMAS_K_WITH_PADDING> { + enum { VALUE = 0 }; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Compute_reset_mask { + enum { VALUE = MMAS_K - 1 }; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + // The instruction traits. + typename Traits, + // The dimensions of the tile computed by the CTA. + typename Cta_tile, + // The size of the STS. + int BYTES_PER_STS, + // The number of buffers per tile. + int BUFFERS_PER_TILE, + // How many rows to use for the XOR pattern to avoid bank conflicts? + int ROWS_PER_XOR_PATTERN_ = Rows_per_xor_pattern_volta_a::VALUE> +struct Smem_tile_volta_row_a + : public Smem_tile_without_skews { + // The MMA tile. + using Mma_tile = typename Traits::template Mma_tile; + // The base class. + using Base = Smem_tile_without_skews; + // The fragment. + using Fragment = Fragment_a; + + // When we use padding to reach a power of two, special care has to be taken. + using Cta_tile_with_padding = Cta_tile_with_k_with_padding; + // The number of MMAs. + using Mma_tile_with_padding = typename Traits::template Mma_tile; + + // The size of a single LDS in bytes. + enum { BYTES_PER_LDS = 16 }; + + // Ctor. + inline __device__ Smem_tile_volta_row_a(void* smem, int tidx) : Base(smem, tidx) { + // For documentation on the layout, see doc/xmma_smem_layout.xlsx. + + // The number of warps. + int const WARPS_M = Cta_tile::WARPS_M; + int const WARPS_N = Cta_tile::WARPS_N; + int const WARPS_K = Cta_tile::WARPS_K; + + // The masks to select the warps. + int const WARP_MASK_M = Warp_masks::M; + int const WARP_MASK_K = Warp_masks::K; + + // The divisor for the warps. + int const WARP_DIV_M = 1 * 1 * Cta_tile::THREADS_PER_WARP; + int const WARP_DIV_K = WARPS_M * WARPS_N * Cta_tile::THREADS_PER_WARP; + + // The row and column read by the thread. 
+ int smem_read_row, smem_read_col; + if (Base::N_WITH_PADDING >= 64) { + smem_read_row = (tidx & WARP_MASK_M) / WARP_DIV_M * Mma_tile::M_PER_MMA / 1 + + (tidx & 0x10) / 2 + (tidx & 0x07); + smem_read_col = (tidx & 0x03); + } else if (Base::N_WITH_PADDING == 32) { + smem_read_row = (tidx & WARP_MASK_M) / WARP_DIV_M * Mma_tile::M_PER_MMA / 2 + + (tidx & 0x10) / 4 + (tidx & 0x06) / 2; + smem_read_col = (tidx & 0x02) / 2 + (tidx & 0x01) * 4; + } else { + assert(false); + } + + // For WARPS_K > 1, we do not support Base::N_WITH_PADDING < 64 for the moment. + static_assert(WARPS_K <= 2 && (WARPS_K == 1 || Base::N_WITH_PADDING >= 64), ""); + + // We "swap" the block for the second warp working on the in-CTA split-K. + if (WARPS_K == 2) { + smem_read_col ^= (tidx & WARP_MASK_K) / WARP_DIV_K * Mma_tile_with_padding::MMAS_K; + } + + // The shared memory offset. + this->smem_read_offset_ = smem_read_row * Base::BYTES_PER_ROW + smem_read_col * BYTES_PER_LDS; + } + + // Rewind smem_read_offset for last LDS phase in main loop.- + inline __device__ void reverse_smem_read_offset(int ki = 0) { + // Move the offset to the next position. See doc/xmma_smem_layout.xlsx. + this->smem_read_offset_ ^= ((ki % 2 == 0) ? 1 : 3) * BYTES_PER_LDS; + } + + // Load from shared memory. + inline __device__ void load(Fragment (&a)[Mma_tile::MMAS_M], int ki) { +#pragma unroll + for (int mi = 0; mi < Mma_tile::MMAS_M; ++mi) { + // Jump over as many rows as needed. + int offset = mi * Mma_tile::M_PER_MMA_PER_CTA * Base::BYTES_PER_ROW_BEFORE_PACKING; + + // TODO: Could we fuse smem_read_buffer and smem_read_offset? + uint4 tmp; + lds(tmp, this->smem_ + this->smem_read_offset_ + this->smem_read_buffer_ + offset); + a[mi].reg(0) = tmp.x; + a[mi].reg(1) = tmp.y; + a[mi].reg(2) = tmp.z; + a[mi].reg(3) = tmp.w; + } + + // Move the offset to the next position. See doc/xmma_smem_layout.xlsx. + static_assert(Mma_tile_with_padding::MMAS_K < 64, "Not implemented"); + if (Mma_tile_with_padding::MMAS_K >= 32 && ki % 16 == 15) { + this->smem_read_offset_ ^= 31 * BYTES_PER_LDS; + } else if (Mma_tile_with_padding::MMAS_K >= 16 && ki % 8 == 7) { + this->smem_read_offset_ ^= 15 * BYTES_PER_LDS; + } else if (Mma_tile_with_padding::MMAS_K >= 8 && ki % 4 == 3) { + this->smem_read_offset_ ^= 7 * BYTES_PER_LDS; + } else if (Mma_tile_with_padding::MMAS_K >= 4 && ki % 2 == 1) { + this->smem_read_offset_ ^= 3 * BYTES_PER_LDS; + } else if (Mma_tile_with_padding::MMAS_K >= 2) { + this->smem_read_offset_ ^= 1 * BYTES_PER_LDS; + } + } + + // Reset the read offset. + inline __device__ void reset_read_offset() { + // The number of MMAs in the K dimension. + enum { MMAS_K = Mma_tile::MMAS_K }; + + // The number of MMAs in the K dimension when we include padding. + enum { MMAS_K_WITH_PADDING = Mma_tile_with_padding::MMAS_K }; + + // Assemble the mask. + enum { MASK = Compute_reset_mask::VALUE }; + + // Reset the read offset. + this->smem_read_offset_ ^= MASK * BYTES_PER_LDS; + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + // The dimensions of the tile computed by the CTA. + typename Cta_tile, + // The size of the STS. + int BYTES_PER_STS, + // The number of buffers per tile. + int BUFFERS_PER_TILE> +struct Smem_tile_a + : public Smem_tile_volta_row_a { + // The traits class. + using Traits = fmha::Volta_hmma_fp16_traits; + // The base class. + using Base = Smem_tile_volta_row_a; + + // Ctor. 
+ inline __device__ Smem_tile_a(void* smem, int tidx) : Base(smem, tidx) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Rows_per_xor_pattern_turing_a { + // The size in bits. + enum { N_IN_BITS = N * Traits::BITS_PER_ELEMENT_A }; + + // The number of rows. + enum { VALUE = N_IN_BITS <= 128 ? 1 : (N_IN_BITS <= 256 ? 2 : (N_IN_BITS <= 512 ? 4 : 8)) }; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + // The instruction traits. + typename Traits, + // The dimensions of the tile computed by the CTA. + typename Cta_tile, + // The size of the STS. + int BYTES_PER_STS, + // The number of buffers per tile. + int BUFFERS_PER_TILE, + // How many rows to use for the XOR pattern to avoid bank conflicts? + int ROWS_PER_XOR_PATTERN_ = Rows_per_xor_pattern_turing_a::VALUE> +struct Smem_tile_turing_row_a + : public Smem_tile_without_skews { + // The MMA tile. + using Mma_tile = typename Traits::template Mma_tile; + // The base class. + using Base = + Smem_tile_without_skews; + // The fragment. + using Fragment = Fragment_a; + + // When we use padding to reach a power of two, special care has to be taken. + using Cta_tile_with_padding = Cta_tile_with_k_with_padding; + // The number of MMAs. + using Mma_tile_with_padding = typename Traits::template Mma_tile; + + // The size of a single LDS in bytes. + enum { BYTES_PER_LDS = 16 }; + + // Ctor. + inline __device__ Smem_tile_turing_row_a(void* smem, int tidx) : Base(smem, tidx) { + // For documentation on the layout, see doc/mma_smem_layout.xlsx. + + // The number of warps. + int const WARPS_M = Cta_tile::WARPS_M; + int const WARPS_N = Cta_tile::WARPS_N; + int const WARPS_K = Cta_tile::WARPS_K; + + // The masks to select the warps. + int const WARP_MASK_M = Warp_masks::M; + int const WARP_MASK_K = Warp_masks::K; + + // The divisor for the warps. + int const WARP_DIV_M = 1 * 1 * Cta_tile::THREADS_PER_WARP; + int const WARP_DIV_K = WARPS_M * WARPS_N * Cta_tile::THREADS_PER_WARP; + + // The row and column read by the thread. + int smem_read_row, smem_read_col; + + static_assert(Base::ROWS_PER_XOR_PATTERN == 8 || Base::ROWS_PER_XOR_PATTERN == 4 || + Base::ROWS_PER_XOR_PATTERN == 2 || Base::ROWS_PER_XOR_PATTERN == 1, + ""); + + if (Base::ROWS_PER_XOR_PATTERN == 8) { + smem_read_row = (tidx & WARP_MASK_M) / WARP_DIV_M * Mma_tile::M_PER_MMA / 1 + (tidx & 0x0f); + smem_read_col = (tidx & 0x07); + } else if (Base::ROWS_PER_XOR_PATTERN == 4) { + smem_read_row = + (tidx & WARP_MASK_M) / WARP_DIV_M * Mma_tile::M_PER_MMA / 2 + (tidx & 0x0e) / 2; + smem_read_col = (tidx & 0x06) / 2 + (tidx & 0x01) * 4; + } else if (Base::ROWS_PER_XOR_PATTERN == 2) { + smem_read_row = + (tidx & WARP_MASK_M) / WARP_DIV_M * Mma_tile::M_PER_MMA / 4 + (tidx & 0x0c) / 4; + smem_read_col = (tidx & 0x04) / 4 + (tidx & 0x03) * 2; + } else if (Base::ROWS_PER_XOR_PATTERN == 1) { + smem_read_row = + (tidx & WARP_MASK_M) / WARP_DIV_M * Mma_tile::M_PER_MMA / 8 + (tidx & 0x1f) / 8; + smem_read_col = (tidx & 0x07); + } + + static_assert(WARPS_K <= 2, ""); + + // We "swap" the block for the second warp working on the in-CTA split-K. + if (WARPS_K == 2) { + smem_read_col ^= (tidx & WARP_MASK_K) / WARP_DIV_K * Mma_tile_with_padding::MMAS_K; + } + + // The shared memory offset. 
+ this->smem_read_offset_ = smem_read_row * Base::BYTES_PER_ROW + smem_read_col * BYTES_PER_LDS; + } + + // Rewind smem_read_offset for last LDS phase in main loop.- + inline __device__ void reverse_smem_read_offset(int ki = 0) { + // Move the offset to the next position. See doc/mma_smem_layout.xlsx. + this->smem_read_offset_ ^= ((ki % 2 == 0) ? 1 : 3) * BYTES_PER_LDS; + } + + // Load from shared memory. + inline __device__ void load(Fragment (&a)[Mma_tile::MMAS_M], int ki) { +#pragma unroll + for (int mi = 0; mi < Mma_tile::MMAS_M; ++mi) { + int offset = mi * Mma_tile::M_PER_MMA_PER_CTA * Base::BYTES_PER_ROW_BEFORE_PACKING; + uint2 tmp; + ldsm(tmp, this->smem_ + this->smem_read_offset_ + this->smem_read_buffer_ + offset); + a[mi].reg(0) = tmp.x; + a[mi].reg(1) = tmp.y; + } + + // Move the offset to the next position. See doc/mma_smem_layout.xlsx. + static_assert(Mma_tile_with_padding::MMAS_K < 64, "Not implemented"); + if (Mma_tile_with_padding::MMAS_K >= 32 && ki % 16 == 15) { + this->smem_read_offset_ ^= 31 * BYTES_PER_LDS; + } else if (Mma_tile_with_padding::MMAS_K >= 16 && ki % 8 == 7) { + this->smem_read_offset_ ^= 15 * BYTES_PER_LDS; + } else if (Mma_tile_with_padding::MMAS_K >= 8 && ki % 4 == 3) { + this->smem_read_offset_ ^= 7 * BYTES_PER_LDS; + } else if (Mma_tile_with_padding::MMAS_K >= 4 && ki % 2 == 1) { + this->smem_read_offset_ ^= 3 * BYTES_PER_LDS; + } else if (Mma_tile_with_padding::MMAS_K >= 2) { + this->smem_read_offset_ ^= 1 * BYTES_PER_LDS; + } + } + + // Reset the read offset. + inline __device__ void reset_read_offset() { + // The number of MMAs in the K dimension. + enum { MMAS_K = Mma_tile::MMAS_K }; + + // The number of MMAs in the K dimension when we include padding. + enum { MMAS_K_WITH_PADDING = Mma_tile_with_padding::MMAS_K }; + + // Assemble the mask. + enum { MASK = Compute_reset_mask::VALUE }; + + // Reset the read offset. + this->smem_read_offset_ ^= MASK * BYTES_PER_LDS; + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + // The dimensions of the tile computed by the CTA. + typename Cta_tile, + // The size of the STS. + int BYTES_PER_STS, + // The number of buffers per tile. + int BUFFERS_PER_TILE> +struct Smem_tile_a + : public Smem_tile_turing_row_a { + // The traits class. + using Traits = Turing_hmma_fp16_traits; + // The base class. + using Base = Smem_tile_turing_row_a; + + // Ctor. + inline __device__ Smem_tile_a(void* smem, int tidx) : Base(smem, tidx) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + // The dimensions of the tile computed by the CTA. + typename Cta_tile, + // The size of the STS. + int BYTES_PER_STS, + // The number of buffers per tile. + int BUFFERS_PER_TILE> +struct Smem_tile_a + : public Smem_tile_turing_row_a { + // The traits class. + using Traits = Turing_hmma_fp32_traits; + // The base class. + using Base = Smem_tile_turing_row_a; + + // Ctor. + inline __device__ Smem_tile_a(void* smem, int tidx) : Base(smem, tidx) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + // The dimensions of the tile computed by the CTA. + typename Cta_tile, + // The size of the STS. + int BYTES_PER_STS, + // The number of buffers per tile. + int BUFFERS_PER_TILE> +struct Smem_tile_a + : public Smem_tile_turing_row_a { + // The traits class. + using Traits = Turing_imma_int8_int32_traits; + // The base class. 
+ using Base = Smem_tile_turing_row_a; + + // Ctor. + inline __device__ Smem_tile_a(void* smem, int tidx) : Base(smem, tidx) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Rows_per_xor_pattern_ampere_a { + // The size in bits. + enum { N_IN_BITS = N * Traits::BITS_PER_ELEMENT_A }; + + // The number of rows. + enum { VALUE = N_IN_BITS <= 256 ? 2 : (N_IN_BITS <= 512 ? 4 : 8) }; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Rows_per_xor_pattern_ampere_row_a : public Rows_per_xor_pattern_ampere_a {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + // The instruction traits. + typename Traits, + // The dimensions of the tile computed by the CTA. + typename Cta_tile, + // The size of the STS. + int BYTES_PER_STS, + // The number of buffers per tile. + int BUFFERS_PER_TILE, + // How many rows to use for the XOR pattern to avoid bank conflicts? + int ROWS_PER_XOR_PATTERN_ = Rows_per_xor_pattern_ampere_row_a::VALUE> +struct Smem_tile_ampere_row_a + : public Smem_tile_without_skews { + // The MMA tile. + using Mma_tile = typename Traits::template Mma_tile; + // The base class. + using Base = + Smem_tile_without_skews; + // The fragment. + using Fragment = Fragment_a; + + // When we use padding to reach a power of two, special care has to be taken. + using Cta_tile_with_padding = Cta_tile_with_k_with_padding; + // The number of MMAs. + using Mma_tile_with_padding = typename Traits::template Mma_tile; + + // The size of a single LDS in bytes. + enum { BYTES_PER_LDS = 16 }; + + // Ctor. + inline __device__ Smem_tile_ampere_row_a(void* smem, int tidx) : Base(smem, tidx) { + // For documentation on the layout, see doc/mma_smem_layout.xlsx. + + // The number of warps. + int const WARPS_M = Cta_tile::WARPS_M; + int const WARPS_N = Cta_tile::WARPS_N; + int const WARPS_K = Cta_tile::WARPS_K; + + // The masks to select the warps. + int const WARP_MASK_M = Warp_masks::M; + int const WARP_MASK_K = Warp_masks::K; + + // The divisor for the warps. + int const WARP_DIV_M = 1 * 1 * Cta_tile::THREADS_PER_WARP; + int const WARP_DIV_K = WARPS_M * WARPS_N * Cta_tile::THREADS_PER_WARP; + + // The row and column read by the thread. + int smem_read_row, smem_read_col; + + static_assert(Base::ROWS_PER_XOR_PATTERN == 8 || Base::ROWS_PER_XOR_PATTERN == 4 || + Base::ROWS_PER_XOR_PATTERN == 2, + ""); + + if (Base::ROWS_PER_XOR_PATTERN == 8) { + smem_read_row = (tidx & WARP_MASK_M) / WARP_DIV_M * Mma_tile::M_PER_MMA / 1 + (tidx & 0x0f); + smem_read_col = (tidx & 0x07); + smem_read_col ^= (tidx & 0x10) / 16; + } else if (Base::ROWS_PER_XOR_PATTERN == 4) { + smem_read_row = + (tidx & WARP_MASK_M) / WARP_DIV_M * Mma_tile::M_PER_MMA / 2 + (tidx & 0x0e) / 2; + smem_read_col = (tidx & 0x06) / 2 + (tidx & 0x01) * 4; + smem_read_col ^= (tidx & 0x10) / 16; + } else if (Base::ROWS_PER_XOR_PATTERN == 2) { + smem_read_row = + (tidx & WARP_MASK_M) / WARP_DIV_M * Mma_tile::M_PER_MMA / 4 + (tidx & 0x0c) / 4; + smem_read_col = (tidx & 0x04) / 4 + (tidx & 0x03) * 2; + smem_read_col ^= (tidx & 0x10) / 16; + } + + static_assert(WARPS_K <= 2, ""); + static_assert(WARPS_K != 2 || Base::ROWS_PER_XOR_PATTERN != 2, ""); + + // We "swap" the block for the second warp working on the same outputs in-CTA split-K. 
+ if (WARPS_K == 2) { + smem_read_col ^= (tidx & WARP_MASK_K) / WARP_DIV_K * Mma_tile_with_padding::MMAS_K * 2; + } + + // The shared memory offset. + this->smem_read_offset_ = smem_read_row * Base::BYTES_PER_ROW + smem_read_col * BYTES_PER_LDS; + } + + // Rewind smem_read_offset for last LDS phase in main loop. + inline __device__ void reverse_smem_read_offset(int ki = 0) { + // Undo the pointer increment for the next ni. + // Should match the load function below for ki = 0. + if (Mma_tile_with_padding::MMAS_K >= 2) { + this->smem_read_offset_ ^= BYTES_PER_LDS * 2; + } + } + + // Load from shared memory. + inline __device__ void load(Fragment (&a)[Mma_tile::MMAS_M], int ki) { + if (ki < Mma_tile::VALID_MMAS_K) { +#pragma unroll + for (int mi = 0; mi < Mma_tile::MMAS_M; ++mi) { + // Jump by as many matrix rows as needed (a row in smem may pack multiple matrix rows). + int offset = mi * Mma_tile::M_PER_MMA_PER_CTA * Base::BYTES_PER_ROW_BEFORE_PACKING; + + // Load using LDSM.M88.4. + uint4 tmp; + ldsm(tmp, this->smem_ + this->smem_read_offset_ + this->smem_read_buffer_ + offset); + + // Store the value into the fragment. + a[mi].reg(0) = tmp.x; + a[mi].reg(1) = tmp.y; + a[mi].reg(2) = tmp.z; + a[mi].reg(3) = tmp.w; + } + } + + // Move the offset to the next position. See doc/mma_smem_layout.xlsx. + static_assert(Mma_tile_with_padding::MMAS_K < 64, "Not implemented"); + if (Mma_tile_with_padding::MMAS_K >= 32 && ki % 16 == 15) { + this->smem_read_offset_ ^= 31 * BYTES_PER_LDS * 2; + } else if (Mma_tile_with_padding::MMAS_K >= 16 && ki % 8 == 7) { + this->smem_read_offset_ ^= 15 * BYTES_PER_LDS * 2; + } else if (Mma_tile_with_padding::MMAS_K >= 8 && ki % 4 == 3) { + this->smem_read_offset_ ^= 7 * BYTES_PER_LDS * 2; + } else if (Mma_tile_with_padding::MMAS_K >= 4 && ki % 2 == 1) { + this->smem_read_offset_ ^= 3 * BYTES_PER_LDS * 2; + } else if (Mma_tile_with_padding::MMAS_K >= 2) { + this->smem_read_offset_ ^= 1 * BYTES_PER_LDS * 2; + } + } + + // Reset the read offset. + inline __device__ void reset_read_offset() { + // The number of MMAs in the K dimension. + enum { MMAS_K = Mma_tile::MMAS_K }; + + // The number of MMAs in the K dimension when we include padding. + enum { MMAS_K_WITH_PADDING = Mma_tile_with_padding::MMAS_K }; + + // Assemble the mask. + enum { MASK = Compute_reset_mask::VALUE }; + + // Reset the read offset. + this->smem_read_offset_ ^= MASK * BYTES_PER_LDS * 2; + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + // The dimensions of the tile computed by the CTA. + typename Cta_tile, + // The size of the STS. + int BYTES_PER_STS, + // The number of buffers per tile. + int BUFFERS_PER_TILE> +struct Smem_tile_a + : public Smem_tile_ampere_row_a { + // The traits class. + using Traits = Ampere_hmma_fp16_traits; + // The base class. + using Base = Smem_tile_ampere_row_a; + + // Ctor. + inline __device__ Smem_tile_a(void* smem, int tidx) : Base(smem, tidx) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + // The dimensions of the tile computed by the CTA. + typename Cta_tile, + // The size of the STS. + int BYTES_PER_STS, + // The number of buffers per tile. + int BUFFERS_PER_TILE> +struct Smem_tile_a + : public Smem_tile_ampere_row_a { + // The traits class. + using Traits = Ampere_hmma_fp32_traits; + // The base class. + using Base = Smem_tile_ampere_row_a; + + // Ctor. 
+ inline __device__ Smem_tile_a(void* smem, int tidx) : Base(smem, tidx) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + // The dimensions of the tile computed by the CTA. + typename Cta_tile, + // The size of the STS. + int BYTES_PER_STS, + // The number of buffers per tile. + int BUFFERS_PER_TILE> +struct Smem_tile_a + : public Smem_tile_ampere_row_a { + // The traits class. + using Traits = Ampere_hmma_bf16_traits; + // The base class. + using Base = Smem_tile_ampere_row_a; + + // Ctor. + inline __device__ Smem_tile_a(void* smem, int tidx) : Base(smem, tidx) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + // The dimensions of the tile computed by the CTA. + typename Cta_tile, + // The size of the STS. + int BYTES_PER_STS, + // The number of buffers per tile. + int BUFFERS_PER_TILE> +struct Smem_tile_a + : public Smem_tile_ampere_row_a { + // The traits class. + using Traits = Ampere_imma_int8_int32_traits; + // The base class. + using Base = Smem_tile_ampere_row_a; + + // Ctor. + inline __device__ Smem_tile_a(void* smem, int tidx) : Base(smem, tidx) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + // The dimensions of the tile computed by the CTA. + typename Cta_tile, + // The size of the STS. + int BYTES_PER_STS, + // The number of buffers per tile. + int BUFFERS_PER_TILE> +struct Smem_tile_a + : public Smem_tile_ampere_row_a { + // The traits class. + using Traits = Ada_qmma_e4m3_fp32_traits; + // The base class. + using Base = Smem_tile_ampere_row_a; + + // Ctor. + inline __device__ Smem_tile_a(void* smem, int tidx) : Base(smem, tidx) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + // The dimensions of the tile computed by the CTA. + typename Cta_tile, + // The size of the STS. + int BYTES_PER_STS, + // The number of buffers per tile. + int BUFFERS_PER_TILE> +struct Smem_tile_a + : public Smem_tile_ampere_row_a { + // The traits class. + using Traits = Ada_qmma_e4m3_fp16_traits; + // The base class. + using Base = Smem_tile_ampere_row_a; + + // Ctor. + inline __device__ Smem_tile_a(void* smem, int tidx) : Base(smem, tidx) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + // The instruction traits. + typename Traits, + // The dimensions of the tile computed by the CTA. + typename Cta_tile, + // The layout of the tile. + typename Layout, + // The size of the STS. + int BYTES_PER_STS = 16, + // The number of buffers per tile. + int BUFFERS_PER_TILE = 1, + // Use or not predicates + bool USE_PREDICATES = true> +struct Smem_tile_b {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Rows_per_xor_pattern_volta_b { + // The size in bits. + enum { N_IN_BITS = N * Traits::BITS_PER_ELEMENT_B }; + + // The number of rows. + enum { VALUE = N_IN_BITS <= 256 ? 1 : (N_IN_BITS <= 512 ? 2 : 4) }; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + // The instruction traits. + typename Traits, + // The dimensions of the tile computed by the CTA. + typename Cta_tile, + // The size of the STS. + int BYTES_PER_STS, + // The number of buffers per tile. 
+ int BUFFERS_PER_TILE, + // How many rows to use for the XOR pattern to avoid bank conflicts? + int ROWS_PER_XOR_PATTERN_ = Rows_per_xor_pattern_volta_b::VALUE> +struct Smem_tile_volta_col_b + : public Smem_tile_without_skews { + // The MMA tile. + using Mma_tile = typename Traits::template Mma_tile; + // The base class. + using Base = Smem_tile_without_skews; + + // When we use padding to reach a power of two, special care has to be taken. + using Cta_tile_with_padding = Cta_tile_with_k_with_padding; + // The number of MMAs. + using Mma_tile_with_padding = typename Traits::template Mma_tile; + // The fragment. + using Fragment = Fragment_b; + + // The size of a single LDS in bytes. + enum { BYTES_PER_LDS = 16 }; + + // Ctor. + inline __device__ Smem_tile_volta_col_b(void* smem, int tidx) : Base(smem, tidx) { + // For documentation on the layout, see doc/xmma_smem_layout.xlsx. + + // The number of warps. + int const WARPS_M = Cta_tile::WARPS_M; + int const WARPS_N = Cta_tile::WARPS_N; + int const WARPS_K = Cta_tile::WARPS_K; + + // The masks to select the warps. + int const WARP_MASK_N = Warp_masks::N; + int const WARP_MASK_K = Warp_masks::K; + + // The divisor for the warps. + int const WARP_DIV_N = WARPS_M * 1 * Cta_tile::THREADS_PER_WARP; + int const WARP_DIV_K = WARPS_M * WARPS_N * Cta_tile::THREADS_PER_WARP; + + // The row and column read by the thread. + int smem_read_row, smem_read_col; + + if (Base::N_WITH_PADDING >= 64) { + smem_read_row = (tidx & WARP_MASK_N) / WARP_DIV_N * Mma_tile::N_PER_MMA / 1 + + (tidx & 0x18) / 2 + (tidx & 0x03); + smem_read_col = (tidx & 0x03); + } else if (Base::N_WITH_PADDING == 32) { + smem_read_row = (tidx & WARP_MASK_N) / WARP_DIV_N * Mma_tile::N_PER_MMA / 2 + + (tidx & 0x18) / 4 + (tidx & 0x02) / 2; + smem_read_col = (tidx & 0x02) / 2 + (tidx & 0x01) * 4; + } else { + assert(false); + } + + // For WARPS_K > 1, we do not support Base::N_WITH_PADDING < 64 for the moment. + static_assert(WARPS_K <= 2 && (WARPS_K == 1 || Base::N_WITH_PADDING >= 64), ""); + + // We "swap" the block for the second warp working on the in-CTA split-K. + if (WARPS_K == 2) { + smem_read_col ^= (tidx & WARP_MASK_K) / WARP_DIV_K * Mma_tile_with_padding::MMAS_K; + } + + // The shared memory offset. + this->smem_read_offset_ = smem_read_row * Base::BYTES_PER_ROW + smem_read_col * BYTES_PER_LDS; + } + + // Rewind smem_read_offset for last LDS phase in main loop.- + inline __device__ void reverse_smem_read_offset(int ki = 0) { + // Move the offset to the next position. See doc/xmma_smem_layout.xlsx. + this->smem_read_offset_ ^= ((ki % 2 == 0) ? 1 : 3) * BYTES_PER_LDS; + } + + // Load from shared memory. + inline __device__ void load(Fragment (&b)[Mma_tile::MMAS_N], int ki) { +#pragma unroll + for (int ni = 0; ni < Mma_tile::MMAS_N; ++ni) { + // Jump over as many rows as needed. + int offset = ni * Mma_tile::N_PER_MMA_PER_CTA * Base::BYTES_PER_ROW_BEFORE_PACKING; + + // TODO: Can we fuse read_offset and read_buffer? + uint4 tmp; + lds(tmp, this->smem_ + this->smem_read_offset_ + this->smem_read_buffer_ + offset); + b[ni].reg(0) = tmp.x; + b[ni].reg(1) = tmp.y; + b[ni].reg(2) = tmp.z; + b[ni].reg(3) = tmp.w; + } + + // Move the offset to the next position. See doc/xmma_smem_layout.xlsx. 
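+ // (Worked example, assuming Mma_tile_with_padding::MMAS_K == 8: starting from slot 0, the
+ // successive ki steps below XOR the offset by 1, 3, 1, 7, 1, 3, 1, 7 LDS-widths, visiting the
+ // swizzled 16B slots 1, 2, 3, 4, 5, 6, 7 and returning to slot 0 after the last step.)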
+ static_assert(Mma_tile_with_padding::MMAS_K < 64, "Not implemented"); + if (Mma_tile_with_padding::MMAS_K >= 32 && ki % 16 == 15) { + this->smem_read_offset_ ^= 31 * BYTES_PER_LDS; + } else if (Mma_tile_with_padding::MMAS_K >= 16 && ki % 8 == 7) { + this->smem_read_offset_ ^= 15 * BYTES_PER_LDS; + } else if (Mma_tile_with_padding::MMAS_K >= 8 && ki % 4 == 3) { + this->smem_read_offset_ ^= 7 * BYTES_PER_LDS; + } else if (Mma_tile_with_padding::MMAS_K >= 4 && ki % 2 == 1) { + this->smem_read_offset_ ^= 3 * BYTES_PER_LDS; + } else if (Mma_tile_with_padding::MMAS_K >= 2) { + this->smem_read_offset_ ^= 1 * BYTES_PER_LDS; + } + } + + // Reset the read offset. + inline __device__ void reset_read_offset() { + // The number of MMAs in the K dimension. + enum { MMAS_K = Mma_tile::MMAS_K }; + + // The number of MMAs in the K dimension when we include padding. + enum { MMAS_K_WITH_PADDING = Mma_tile_with_padding::MMAS_K }; + + // Assemble the mask. + enum { MASK = Compute_reset_mask::VALUE }; + + // Reset the read offset. + this->smem_read_offset_ ^= MASK * BYTES_PER_LDS; + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + // The dimensions of the tile computed by the CTA. + typename Cta_tile, + // The size of the STS. + int BYTES_PER_STS, + // The number of buffers per tile. + int BUFFERS_PER_TILE> +struct Smem_tile_b + : public Smem_tile_volta_col_b { + // The traits class. + using Traits = fmha::Volta_hmma_fp16_traits; + // The base class. + using Base = Smem_tile_volta_col_b; + + // Ctor. + inline __device__ Smem_tile_b(void* smem, int tidx) : Base(smem, tidx) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Rows_per_xor_pattern_turing_b { + // The size in bits. + enum { N_IN_BITS = N * Traits::BITS_PER_ELEMENT_B }; + + // The number of rows. + enum { VALUE = N_IN_BITS <= 128 ? 1 : (N_IN_BITS <= 256 ? 2 : (N_IN_BITS <= 512 ? 4 : 8)) }; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + // The instruction traits. + typename Traits, + // The dimensions of the tile computed by the CTA. + typename Cta_tile, + // The size of the STS. + int BYTES_PER_STS, + // The number of buffers per tile. + int BUFFERS_PER_TILE, + // How many rows to use for the XOR pattern to avoid bank conflicts? + int ROWS_PER_XOR_PATTERN_ = Rows_per_xor_pattern_turing_b::VALUE> +struct Smem_tile_turing_col_b + : public Smem_tile_without_skews { + // The MMA tile. + using Mma_tile = typename Traits::template Mma_tile; + // The base class. + using Base = + Smem_tile_without_skews; + // The fragment. + using Fragment = Fragment_b; + + // When we use padding to reach a power of two, special care has to be taken. + using Cta_tile_with_padding = Cta_tile_with_k_with_padding; + // The number of MMAs. + using Mma_tile_with_padding = typename Traits::template Mma_tile; + + // The size of a single LDS in bytes. + enum { BYTES_PER_LDS = 16 }; + + // Ctor. + inline __device__ Smem_tile_turing_col_b(void* smem, int tidx) : Base(smem, tidx) { + // For documentation on the layout, see doc/mma_smem_layout.xlsx. + + // The number of warps. + int const WARPS_M = Cta_tile::WARPS_M; + int const WARPS_N = Cta_tile::WARPS_N; + int const WARPS_K = Cta_tile::WARPS_K; + + // The masks to select the warps. + int const WARP_MASK_N = Warp_masks::N; + int const WARP_MASK_K = Warp_masks::K; + + // The divisor for the warps. 
+ int const WARP_DIV_N = WARPS_M * 1 * Cta_tile::THREADS_PER_WARP; + int const WARP_DIV_K = WARPS_M * WARPS_N * Cta_tile::THREADS_PER_WARP; + + // The row and column read by the thread. + int smem_read_row, smem_read_col; + + static_assert(Base::ROWS_PER_XOR_PATTERN == 8 || Base::ROWS_PER_XOR_PATTERN == 4 || + Base::ROWS_PER_XOR_PATTERN == 2 || Base::ROWS_PER_XOR_PATTERN == 1, + ""); + + if (Base::ROWS_PER_XOR_PATTERN == 8) { + // For group fprop. B is divided into 2 halves along N dimension. + // The fist warp takes the first half and the second warp takes the second half. + smem_read_row = (tidx & WARP_MASK_N) / WARP_DIV_N * Mma_tile::N_PER_MMA / 1 + (tidx & 0x0f); + smem_read_col = (tidx & 0x07); + } else if (Base::ROWS_PER_XOR_PATTERN == 4) { + smem_read_row = + (tidx & WARP_MASK_N) / WARP_DIV_N * Mma_tile::N_PER_MMA / 2 + (tidx & 0x0e) / 2; + smem_read_col = (tidx & 0x06) / 2 + (tidx & 0x01) * 4; + } else if (Base::ROWS_PER_XOR_PATTERN == 2) { + smem_read_row = + (tidx & WARP_MASK_N) / WARP_DIV_N * Mma_tile::N_PER_MMA / 4 + (tidx & 0x0c) / 4; + smem_read_col = (tidx & 0x04) / 4 + (tidx & 0x03) * 2; + } else if (Base::ROWS_PER_XOR_PATTERN == 1) { + smem_read_row = + (tidx & WARP_MASK_N) / WARP_DIV_N * Mma_tile::N_PER_MMA / 8 + (tidx & 0x1f) / 8; + smem_read_col = (tidx & 0x07); + } + + static_assert(WARPS_K <= 2, ""); + + // We "swap" the block for the second warp working on the in-CTA split-K. + if (WARPS_K == 2) { + smem_read_col ^= (tidx & WARP_MASK_K) / WARP_DIV_K * Mma_tile_with_padding::MMAS_K; + } + + // The shared memory offset. + this->smem_read_offset_ = smem_read_row * Base::BYTES_PER_ROW + smem_read_col * BYTES_PER_LDS; + } + + // Rewind smem_read_offset for last LDS phase in main loop.- + inline __device__ void reverse_smem_read_offset(int ki = 0) { + // Move the offset to the next position. See doc/mma_smem_layout.xlsx. + this->smem_read_offset_ ^= ((ki % 2 == 0) ? 1 : 3) * BYTES_PER_LDS; + } + + // Load from shared memory. + inline __device__ void load(Fragment (&b)[Mma_tile::MMAS_N], int ki) { +#pragma unroll + for (int ni = 0; ni < Mma_tile::MMAS_N; ++ni) { + int offset = ni * Mma_tile::N_PER_MMA_PER_CTA * Base::BYTES_PER_ROW_BEFORE_PACKING; + uint2 tmp; + ldsm(tmp, this->smem_ + this->smem_read_offset_ + this->smem_read_buffer_ + offset); + b[ni].reg(0) = tmp.x; + b[ni].reg(1) = tmp.y; + } + // Move the offset to the next position. See doc/mma_smem_layout.xlsx. + static_assert(Mma_tile_with_padding::MMAS_K < 64, "Not implemented"); + if (Mma_tile_with_padding::MMAS_K >= 32 && ki % 16 == 15) { + this->smem_read_offset_ ^= 31 * BYTES_PER_LDS; + } else if (Mma_tile_with_padding::MMAS_K >= 16 && ki % 8 == 7) { + this->smem_read_offset_ ^= 15 * BYTES_PER_LDS; + } else if (Mma_tile_with_padding::MMAS_K >= 8 && ki % 4 == 3) { + this->smem_read_offset_ ^= 7 * BYTES_PER_LDS; + } else if (Mma_tile_with_padding::MMAS_K >= 4 && ki % 2 == 1) { + this->smem_read_offset_ ^= 3 * BYTES_PER_LDS; + } else if (Mma_tile_with_padding::MMAS_K >= 2) { + this->smem_read_offset_ ^= 1 * BYTES_PER_LDS; + } + } + + // Reset the read offset. + inline __device__ void reset_read_offset() { + // The number of MMAs in the K dimension. + enum { MMAS_K = Mma_tile::MMAS_K }; + + // The number of MMAs in the K dimension when we include padding. + enum { MMAS_K_WITH_PADDING = Mma_tile_with_padding::MMAS_K }; + + // Assemble the mask. + enum { MASK = Compute_reset_mask::VALUE }; + + // Reset the read offset. 
+ this->smem_read_offset_ ^= MASK * BYTES_PER_LDS; + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + // The dimensions of the tile computed by the CTA. + typename Cta_tile, + // The size of the STS. + int BYTES_PER_STS, + // The number of buffers per tile. + int BUFFERS_PER_TILE> +struct Smem_tile_b + : public Smem_tile_turing_col_b { + // The traits class. + using Traits = Turing_hmma_fp16_traits; + // The base class. + using Base = Smem_tile_turing_col_b; + + // Ctor. + inline __device__ Smem_tile_b(void* smem, int tidx) : Base(smem, tidx) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + // The dimensions of the tile computed by the CTA. + typename Cta_tile, + // The size of the STS. + int BYTES_PER_STS, + // The number of buffers per tile. + int BUFFERS_PER_TILE> +struct Smem_tile_b + : public Smem_tile_turing_col_b { + // The traits class. + using Traits = Turing_hmma_fp32_traits; + // The base class. + using Base = Smem_tile_turing_col_b; + + // Ctor. + inline __device__ Smem_tile_b(void* smem, int tidx) : Base(smem, tidx) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + // The dimensions of the tile computed by the CTA. + typename Cta_tile, + // The size of the STS. + int BYTES_PER_STS, + // The number of buffers per tile. + int BUFFERS_PER_TILE> +struct Smem_tile_b + : public Smem_tile_turing_col_b { + // The traits class. + using Traits = Turing_imma_int8_int32_traits; + // The base class. + using Base = Smem_tile_turing_col_b; + + // Ctor. + inline __device__ Smem_tile_b(void* smem, int tidx) : Base(smem, tidx) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Rows_per_xor_pattern_ampere_b { + // The size in bits. + enum { N_IN_BITS = N * Traits::BITS_PER_ELEMENT_B }; + + // The number of rows. + enum { VALUE = N_IN_BITS <= 256 ? 2 : (N_IN_BITS <= 512 ? 4 : 8) }; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Rows_per_xor_pattern_ampere_col_b : public Rows_per_xor_pattern_ampere_b {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + // The instruction traits. + typename Traits, + // The dimensions of the tile computed by the CTA. + typename Cta_tile, + // The size of the STS. + int BYTES_PER_STS, + // The number of buffers per tile. + int BUFFERS_PER_TILE, + // How many rows to use for the XOR pattern to avoid bank conflicts? + int ROWS_PER_XOR_PATTERN_ = Rows_per_xor_pattern_ampere_col_b::VALUE> +struct Smem_tile_ampere_col_b + : public Smem_tile_without_skews { + // The MMA tile. + using Mma_tile = typename Traits::template Mma_tile; + // The base class. + using Base = + Smem_tile_without_skews; + // The fragment. + using Fragment = Fragment_b; + + // When we use padding to reach a power of two, special care has to be taken. + using Cta_tile_with_padding = Cta_tile_with_k_with_padding; + // The number of MMAs. + using Mma_tile_with_padding = typename Traits::template Mma_tile; + + // The size of a single LDS in bytes. 
+ enum { BYTES_PER_LDS = 16 }; + + // The number of STS per thread + enum { STS_PER_THREAD_ = Base::ROWS * Base::THREADS_PER_ROW / Cta_tile::THREADS_PER_CTA }; + + // The number of STS per thread must be at least 1. + enum { STS_PER_THREAD = Max<1, STS_PER_THREAD_>::VALUE }; + + // Ctor. + inline __device__ Smem_tile_ampere_col_b(void* smem, int tidx) : Base(smem, tidx) { + // For documentation on the layout, see doc/mma_smem_layout.xlsx. + + // The number of warps. + int const WARPS_M = Cta_tile::WARPS_M; + int const WARPS_N = Cta_tile::WARPS_N; + int const WARPS_K = Cta_tile::WARPS_K; + + // The masks to select the warps. + int const WARP_MASK_N = Warp_masks::N; + int const WARP_MASK_K = Warp_masks::K; + + // The divisor for the warps. + int const WARP_DIV_N = WARPS_M * 1 * Cta_tile::THREADS_PER_WARP; + int const WARP_DIV_K = WARPS_M * WARPS_N * Cta_tile::THREADS_PER_WARP; + + // The row and column read by the thread. + int smem_read_row, smem_read_col; + + static_assert(Base::ROWS_PER_XOR_PATTERN == 8 || Base::ROWS_PER_XOR_PATTERN == 4 || + Base::ROWS_PER_XOR_PATTERN == 2, + ""); + + if (Base::ROWS_PER_XOR_PATTERN == 8) { + // For group fprop. B is divided into 2 halves along N dimension. + // The fist warp takes the first half and the second warp takes the second half. + smem_read_row = (tidx & WARP_MASK_N) / WARP_DIV_N * Mma_tile::N_PER_MMA / 1 + (tidx & 0x07) + + (tidx & 0x10) / 2; + smem_read_col = (tidx & 0x07); + smem_read_col ^= (tidx & 0x08) / 8; + } else if (Base::ROWS_PER_XOR_PATTERN == 4) { + smem_read_row = (tidx & WARP_MASK_N) / WARP_DIV_N * Mma_tile::N_PER_MMA / 2 + + (tidx & 0x06) / 2 + (tidx & 0x10) / 4; + smem_read_col = (tidx & 0x06) / 2 + (tidx & 0x01) * 4; + smem_read_col ^= (tidx & 0x08) / 8; + } else if (Base::ROWS_PER_XOR_PATTERN == 2) { + smem_read_row = (tidx & WARP_MASK_N) / WARP_DIV_N * Mma_tile::N_PER_MMA / 4 + + (tidx & 0x04) / 4 + (tidx & 0x10) / 8; + smem_read_col = (tidx & 0x04) / 4 + (tidx & 0x03) * 2; + smem_read_col ^= (tidx & 0x08) / 8; + } + + static_assert(WARPS_K <= 2, ""); + static_assert(WARPS_K != 2 || Base::ROWS_PER_XOR_PATTERN != 2, ""); + + // We "swap" the block for the second warp working on the in-CTA split-K. + if (WARPS_K == 2) { + smem_read_col ^= (tidx & WARP_MASK_K) / WARP_DIV_K * Mma_tile_with_padding::MMAS_K * 2; + } + + // The shared memory offset. + this->smem_read_offset_ = smem_read_row * Base::BYTES_PER_ROW + smem_read_col * BYTES_PER_LDS; + } + + // Rewind smem_read_offset for last LDS phase in main loop. + inline __device__ void reverse_smem_read_offset(int ki = 0) { + // Undo the pointer increment for the next ni. + // Should match the load function below for ki = 0. + if (Mma_tile_with_padding::MMAS_K >= 2) { + this->smem_read_offset_ ^= BYTES_PER_LDS * 2; + } + } + + // Load from shared memory. + inline __device__ void load(Fragment (&b)[Mma_tile::MMAS_N], int ki) { + if (ki < Mma_tile::VALID_MMAS_K) { +#pragma unroll + for (int ni = 0; ni < Mma_tile::MMAS_N; ++ni) { + // Jump by as many matrix rows as needed (a row in smem may pack multiple matrix rows). + int offset = ni * Mma_tile::N_PER_MMA_PER_CTA * Base::BYTES_PER_ROW_BEFORE_PACKING; + + // Load using LDSM.M88.4. + uint4 tmp; + ldsm(tmp, this->smem_ + this->smem_read_offset_ + this->smem_read_buffer_ + offset); + + // Store the value into the fragment. + b[ni].reg(0) = tmp.x; + b[ni].reg(1) = tmp.y; + b[ni].reg(2) = tmp.z; + b[ni].reg(3) = tmp.w; + } + } + + // Move the offset to the next position. See doc/mma_smem_layout.xlsx. 
+ static_assert(Mma_tile_with_padding::MMAS_K < 64, "Not implemented"); + if (Mma_tile_with_padding::MMAS_K >= 32 && ki % 16 == 15) { + this->smem_read_offset_ ^= 31 * BYTES_PER_LDS * 2; + } else if (Mma_tile_with_padding::MMAS_K >= 16 && ki % 8 == 7) { + this->smem_read_offset_ ^= 15 * BYTES_PER_LDS * 2; + } else if (Mma_tile_with_padding::MMAS_K >= 8 && ki % 4 == 3) { + this->smem_read_offset_ ^= 7 * BYTES_PER_LDS * 2; + } else if (Mma_tile_with_padding::MMAS_K >= 4 && ki % 2 == 1) { + this->smem_read_offset_ ^= 3 * BYTES_PER_LDS * 2; + } else if (Mma_tile_with_padding::MMAS_K >= 2) { + this->smem_read_offset_ ^= 1 * BYTES_PER_LDS * 2; + } + } + + // Reset the read offset. + inline __device__ void reset_read_offset() { + // The number of MMAs in the K dimension. + enum { MMAS_K = Mma_tile::MMAS_K }; + + // The number of MMAs in the K dimension when we include padding. + enum { MMAS_K_WITH_PADDING = Mma_tile_with_padding::MMAS_K }; + + // Assemble the mask. + enum { MASK = Compute_reset_mask::VALUE }; + + // Reset the read offset. + this->smem_read_offset_ ^= MASK * BYTES_PER_LDS * 2; + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + // The dimensions of the tile computed by the CTA. + typename Cta_tile, + // The size of the STS. + int BYTES_PER_STS, + // The number of buffers per tile. + int BUFFERS_PER_TILE> +struct Smem_tile_b + : public Smem_tile_ampere_col_b { + // The traits class. + using Traits = Ampere_hmma_fp16_traits; + // The base class. + using Base = Smem_tile_ampere_col_b; + + // Ctor. + inline __device__ Smem_tile_b(void* smem, int tidx) : Base(smem, tidx) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + // The dimensions of the tile computed by the CTA. + typename Cta_tile, + // The size of the STS. + int BYTES_PER_STS, + // The number of buffers per tile. + int BUFFERS_PER_TILE> +struct Smem_tile_b + : public Smem_tile_ampere_col_b { + // The traits class. + using Traits = Ampere_hmma_fp32_traits; + // The base class. + using Base = Smem_tile_ampere_col_b; + + // Ctor. + inline __device__ Smem_tile_b(void* smem, int tidx) : Base(smem, tidx) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + // The dimensions of the tile computed by the CTA. + typename Cta_tile, + // The size of the STS. + int BYTES_PER_STS, + // The number of buffers per tile. + int BUFFERS_PER_TILE> +struct Smem_tile_b + : public Smem_tile_ampere_col_b { + // The traits class. + using Traits = Ampere_hmma_bf16_traits; + // The base class. + using Base = Smem_tile_ampere_col_b; + + // Ctor. + inline __device__ Smem_tile_b(void* smem, int tidx) : Base(smem, tidx) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + // The dimensions of the tile computed by the CTA. + typename Cta_tile, + // The size of the STS. + int BYTES_PER_STS, + // The number of buffers per tile. + int BUFFERS_PER_TILE> +struct Smem_tile_b + : public Smem_tile_ampere_col_b { + // The traits class. + using Traits = Ampere_imma_int8_int32_traits; + // The base class. + using Base = Smem_tile_ampere_col_b; + + // Ctor. 
+ inline __device__ Smem_tile_b(void* smem, int tidx) : Base(smem, tidx) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + // The dimensions of the tile computed by the CTA. + typename Cta_tile, + // The size of the STS. + int BYTES_PER_STS, + // The number of buffers per tile. + int BUFFERS_PER_TILE> +struct Smem_tile_b + : public Smem_tile_ampere_col_b { + // The traits class. + using Traits = Ada_qmma_e4m3_fp32_traits; + // The base class. + using Base = Smem_tile_ampere_col_b; + + // Ctor. + inline __device__ Smem_tile_b(void* smem, int tidx) : Base(smem, tidx) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + // The dimensions of the tile computed by the CTA. + typename Cta_tile, + // The size of the STS. + int BYTES_PER_STS, + // The number of buffers per tile. + int BUFFERS_PER_TILE> +struct Smem_tile_b + : public Smem_tile_ampere_col_b { + // The traits class. + using Traits = Ada_qmma_e4m3_fp16_traits; + // The base class. + using Base = Smem_tile_ampere_col_b; + + // Ctor. + inline __device__ Smem_tile_b(void* smem, int tidx) : Base(smem, tidx) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Rows_per_xor_pattern_ampere_row_b : public Rows_per_xor_pattern_ampere_b {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + // The instruction traits. + typename Traits, + // The dimensions of the tile computed by the CTA. + typename Cta_tile, + // The size of the STS. + int BYTES_PER_STS, + // The number of buffers per tile. + int BUFFERS_PER_TILE, + // How many rows to use for the XOR pattern to avoid bank conflicts? + int ROWS_PER_XOR_PATTERN_ = Rows_per_xor_pattern_ampere_row_b::VALUE, + // How many cols to use for the XOR pattern to avoid bank conflicts? + int COLS_PER_XOR_PATTERN_ = 1> +struct Smem_tile_ampere_row_b + : public Smem_tile_without_skews { + // The MMA tile. + using Mma_tile = typename Traits::template Mma_tile; + // The base class. + using Base = Smem_tile_without_skews; + // The fragment. + using Fragment = Fragment_b; + + // Can we use LDSM? No if the data type is 32-bit large. + enum { USE_LDSMT = Traits::BITS_PER_ELEMENT_B == 16 }; + + // The size of a single LDS in bytes. + enum { BYTES_PER_LDS = USE_LDSMT ? 16 : 4 }; + + // The number of elements per LDS. + enum { ELEMENTS_PER_LDS = BYTES_PER_LDS * 8 / Traits::BITS_PER_ELEMENT_B }; + + // The number of STS per thread + enum { STS_PER_THREAD_ = Base::ROWS * Base::THREADS_PER_ROW / Cta_tile::THREADS_PER_CTA }; + + // The number of STS per thread must be at least 1. + enum { STS_PER_THREAD = Max<1, STS_PER_THREAD_>::VALUE }; + + // Ctor. + inline __device__ Smem_tile_ampere_row_b(void* smem, int tidx) : Base(smem, tidx) { + // For documentation on the layout, see doc/xmma_smem_layout.xlsx. + + // The number of warps. + int const WARPS_M = Cta_tile::WARPS_M; + int const WARPS_N = Cta_tile::WARPS_N; + int const WARPS_K = Cta_tile::WARPS_K; + + // The masks to select the warps. + int const WARP_MASK_N = Warp_masks::N; + int const WARP_MASK_K = Warp_masks::K; + + // The divisor for the warps. + int const WARP_DIV_N = WARPS_M * 1 * Cta_tile::THREADS_PER_WARP; + int const WARP_DIV_K = WARPS_M * WARPS_N * Cta_tile::THREADS_PER_WARP; + + // The row/col read by the thread. 
+ int smem_read_row, smem_read_col; + + static_assert((USE_LDSMT && Base::ROWS_PER_XOR_PATTERN == 8) || + Base::ROWS_PER_XOR_PATTERN == 4 || Base::ROWS_PER_XOR_PATTERN == 2, + ""); + + if (USE_LDSMT && Base::ROWS_PER_XOR_PATTERN == 8) { + // For group dgrad. B is divided into 2 halves along K dimension. + // The fist warp takes the first half and the second warp takes the second half. + smem_read_row = + (tidx & WARP_MASK_K) / WARP_DIV_K * Mma_tile::MMAS_K * 16 + (tidx & 0x07) + (tidx & 0x08); + smem_read_col = (tidx & 0x07); + } else if (USE_LDSMT && Base::ROWS_PER_XOR_PATTERN == 4) { + smem_read_row = (tidx & WARP_MASK_K) / WARP_DIV_K * Mma_tile::MMAS_K * 8 + (tidx & 0x06) / 2 + + (tidx & 0x08) / 2; + smem_read_col = (tidx & 0x01) * 4 + (tidx & 0x06) / 2; + } else if (USE_LDSMT && Base::ROWS_PER_XOR_PATTERN == 2) { + smem_read_row = (tidx & WARP_MASK_K) / WARP_DIV_K * Mma_tile::MMAS_K * 4 + (tidx & 0x04) / 4 + + (tidx & 0x08) / 4; + smem_read_col = (tidx & 0x03) * 2 + (tidx & 0x04) / 4; + } else if (Base::ROWS_PER_XOR_PATTERN == 4 && Base::COLS_PER_XOR_PATTERN == 2) { + smem_read_row = (tidx & WARP_MASK_K) / WARP_DIV_K * Mma_tile::MMAS_K * 8 + (tidx & 0x03); + smem_read_col = (tidx & 0x1c) / 4 + (tidx & 0x03) * 8; + } + + // Each half-warp applies a different XOR pattern -- see the Excel document. + if (USE_LDSMT) { + smem_read_col ^= (tidx & WARP_MASK_N) / WARP_DIV_N * 2 + (tidx & 0x10) / 16; + } else { + smem_read_col ^= (tidx & WARP_MASK_N) / WARP_DIV_N * 16; + } + + // The shared memory offset. + this->smem_read_offset_ = smem_read_row * Base::BYTES_PER_ROW + smem_read_col * BYTES_PER_LDS; + + // Fill zeroes for group conv + } + + // Rewind smem_read_offset for last LDS phase in main loop. + inline __device__ void reverse_smem_read_offset(int ki = 0) { + // The size of each element in bits. + int const BITS_PER_ELT = Traits::BITS_PER_ELEMENT_B; + // The size in bytes of the data needed to compute an MMA per CTA. + int const BYTES_PER_MMA_PER_CTA = Mma_tile::N_PER_MMA_PER_CTA * BITS_PER_ELT / 8; + +#pragma unroll + for (int ni = 0; ni < Mma_tile::MMAS_N; ++ni) { + // Undo the pointer increment for the next ni. + // Should match the load function below for ki = 0. + if (BYTES_PER_MMA_PER_CTA >= 128) { + // Nothing to do! + } else if (BYTES_PER_MMA_PER_CTA == 64 && Mma_tile::MMAS_N > 1) { + this->smem_read_offset_ ^= BYTES_PER_MMA_PER_CTA; + } else if (BYTES_PER_MMA_PER_CTA == 64) { + // Nothing to do! + } else if (BYTES_PER_MMA_PER_CTA == 32 && Mma_tile::MMAS_N == 4) { + this->smem_read_offset_ ^= BYTES_PER_LDS * (ni % 2 == 0 ? 2 : 6); + } else if (BYTES_PER_MMA_PER_CTA == 32 && Mma_tile::MMAS_N == 2) { + this->smem_read_offset_ ^= BYTES_PER_LDS * 2; + } + } + + // Reset smem_read_offset for odd MMAS_N > 1 (npo2 kernels) + if (BYTES_PER_MMA_PER_CTA == 64 && Mma_tile::MMAS_N > 1 && Mma_tile::MMAS_N % 2 == 1) { + this->smem_read_offset_ ^= BYTES_PER_MMA_PER_CTA; + } + } + + // Load from shared memory. + inline __device__ void load(Fragment (&b)[Mma_tile::VALID_MMAS_N], int ki) { + // The size of each element in bits. + int const BITS_PER_ELT = Traits::BITS_PER_ELEMENT_B; + // The size in bytes of the data needed to compute an MMA per CTA. + int const BYTES_PER_MMA_PER_CTA = Mma_tile::N_PER_MMA_PER_CTA * BITS_PER_ELT / 8; + +#pragma unroll + for (int ni = 0; ni < Mma_tile::MMAS_N; ++ni) { + // Prepare the offset. 
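// Illustrative sketch (not part of the patch): the branches below key off how many bytes
// one MMA's worth of B columns occupies per CTA, and how the per-ni base offset is formed
// from it. A host-side restatement; the example values for a 16-bit B operand are assumptions:
constexpr int bytes_per_mma_per_cta(int n_per_mma_per_cta, int bits_per_elt) {
  return n_per_mma_per_cta * bits_per_elt / 8;
}

constexpr int ni_byte_offset(int bytes_per_mma_per_cta_, int ni) {
  return bytes_per_mma_per_cta_ == 32   ? 0  // handled purely by the XOR swizzle
         : bytes_per_mma_per_cta_ == 64 ? (ni / 2) * bytes_per_mma_per_cta_ * 2
                                        : ni * bytes_per_mma_per_cta_;  // plain linear stride
}

static_assert(bytes_per_mma_per_cta(16, 16) == 32, "N_PER_MMA_PER_CTA = 16 -> 32B branch");
static_assert(bytes_per_mma_per_cta(64, 16) == 128, "N_PER_MMA_PER_CTA = 64 -> 128B branch");
static_assert(ni_byte_offset(128, 3) == 384, "wide rows: linear ni stride");
static_assert(ni_byte_offset(64, 3) == 128, "64B rows: MMA pairs share a 128B block");
static_assert(ni_byte_offset(32, 3) == 0, "32B rows: no additive term, only XOR");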
+ int offset = ki * Base::ROWS_PER_XOR_PATTERN * 2 * Base::BYTES_PER_ROW; + if (BYTES_PER_MMA_PER_CTA == 32) { + offset += this->smem_read_offset_; + } else if (BYTES_PER_MMA_PER_CTA == 64) { + offset += this->smem_read_offset_ + (ni / 2) * BYTES_PER_MMA_PER_CTA * 2; + } else { + offset += this->smem_read_offset_ + (ni)*BYTES_PER_MMA_PER_CTA; + } + + // Load the data using LDSM.MT88.2. + uint32_t ptr = this->smem_ + this->smem_read_buffer_ + offset; + + if (ni < Mma_tile::VALID_MMAS_N) { + uint4 tmp; + if (USE_LDSMT) { + ldsmt(tmp, ptr); + } else { + lds(tmp.x, (ptr) + 0 * Base::BYTES_PER_ROW); + lds(tmp.y, (ptr) + 4 * Base::BYTES_PER_ROW); + lds(tmp.z, (ptr ^ 32) + 0 * Base::BYTES_PER_ROW); + lds(tmp.w, (ptr ^ 32) + 4 * Base::BYTES_PER_ROW); + } + + // Store those values in the fragment. + b[ni].reg(0) = tmp.x; + b[ni].reg(1) = tmp.y; + b[ni].reg(2) = tmp.z; + b[ni].reg(3) = tmp.w; + } + + // static_assert(BYTES_PER_MMA_PER_CTA >= 128 || + // BYTES_PER_MMA_PER_CTA == 64 || + // (BYTES_PER_MMA_PER_CTA == 32 && + // (Mma_tile::MMAS_M == 4 || + // Mma_tile::MMAS_M == 2 || + // Mma_tile::MMAS_M == 1)), ""); + + // Move the pointer for the next ni. I expect the compiler to not recompute those. + if (BYTES_PER_MMA_PER_CTA >= 128) { + // Nothing to do! + } else if (BYTES_PER_MMA_PER_CTA == 64 && Mma_tile::MMAS_N > 1) { + this->smem_read_offset_ ^= BYTES_PER_MMA_PER_CTA; + } else if (BYTES_PER_MMA_PER_CTA == 64) { + // Nothing to do! + } else if (BYTES_PER_MMA_PER_CTA == 32) { + if ((ni & 1) == 0) { + this->smem_read_offset_ ^= BYTES_PER_LDS * 2; + } else if (Mma_tile::MMAS_N >= 16 && (ni & 7) == 7) { + this->smem_read_offset_ ^= BYTES_PER_LDS * 30; + } else if (Mma_tile::MMAS_N >= 8 && (ni & 3) == 3) { + this->smem_read_offset_ ^= BYTES_PER_LDS * 14; + } else if (Mma_tile::MMAS_N >= 4 && (ni & 1) == 1) { + this->smem_read_offset_ ^= BYTES_PER_LDS * 6; + } + } + } + + // Reset smem_read_offset for odd MMAS_N > 1 (npo2 kernels) + if (BYTES_PER_MMA_PER_CTA == 64 && Mma_tile::MMAS_N > 1 && Mma_tile::MMAS_N % 2 == 1) { + this->smem_read_offset_ ^= BYTES_PER_MMA_PER_CTA; + } + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + // The dimensions of the tile computed by the CTA. + typename Cta_tile, + // The size of the STS. + int BYTES_PER_STS, + // The number of buffers per tile. + int BUFFERS_PER_TILE> +struct Smem_tile_b + : public Smem_tile_ampere_row_b { + // The traits class. + using Traits = Ampere_hmma_fp32_traits; + // The base class. + using Base = Smem_tile_ampere_row_b; + + // Ctor. + inline __device__ Smem_tile_b(void* smem, int tidx) : Base(smem, tidx) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + // The dimensions of the tile computed by the CTA. + typename Cta_tile, + // The size of the STS. + int BYTES_PER_STS, + // The number of buffers per tile. + int BUFFERS_PER_TILE> +struct Smem_tile_b + : public Smem_tile_ampere_row_b { + // The traits class. + using Traits = Ampere_hmma_bf16_traits; + // The base class. + using Base = Smem_tile_ampere_row_b; + + // Ctor. 
+ inline __device__ Smem_tile_b(void* smem, int tidx) : Base(smem, tidx) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace fmha diff --git a/csrc/fmha_v2/fmha/smem_tile_o.h b/csrc/fmha_v2/fmha/smem_tile_o.h new file mode 100644 index 0000000000..af7311a111 --- /dev/null +++ b/csrc/fmha_v2/fmha/smem_tile_o.h @@ -0,0 +1,1646 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2011-2024 NVIDIA CORPORATION & AFFILIATES. All rights + * reserved. SPDX-License-Identifier: NVIDIA TensorRT Source Code License Agreement + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. + */ + +#pragma once + +#include +#include +#include +#include + +namespace fmha { + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// H M M A +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Smem_tile_o {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Smem_tile_o { + // The instruction traits. + using Traits = Volta_hmma_fp16_16x16x16_traits; + // The MMA tile. + using Mma_tile = typename Traits::template Mma_tile; + // The accumulators. + using Accumulator = fmha::Fragment_accumulator; + // The accumulators. + using Data_type = typename Accumulator::Data_type; + + // The size of each element. + enum { BYTES_PER_ELEMENT = sizeof(Data_type) }; + + // The size of each STS. + enum { BYTES_PER_STS = 16 }; + + // The size of each row in shared memory. + enum { BYTES_PER_ROW = Cta_tile::N * Cta_tile::WARPS_K * 2 * BYTES_PER_ELEMENT }; + + // The size of each LDS. + enum { BYTES_PER_LDS = 16 }; + + // The number of threads (to produce 16B per LDS). + enum { THREADS_PER_ROW = Cta_tile::N * BYTES_PER_ELEMENT / BYTES_PER_LDS }; + + // The number of rows loaded per LDS. + enum { ROWS_PER_LDS = Cta_tile::THREADS_PER_CTA / THREADS_PER_ROW }; + + // The number of rows. + enum { ROWS = Cta_tile::M }; + + // We want at least one output per thread (if possible). + enum { ROWS_PER_LOOP_ = ROWS <= 64 ? ROWS : (int)Min::VALUE }; + + // We also want to have "complete" MMAs. + enum { ROWS_PER_LOOP = Max::VALUE }; + + // The number of outer loops. + enum { LOOPS = fmha::Div_up::VALUE }; + + // Make sure it matches our expectations. + static_assert(ROWS_PER_LOOP >= (int)Mma_tile::M_PER_MMA_PER_CTA, ""); + + // Do we have to guard against partial writes/reads. + enum { HAS_INCOMPLETE_LDS = ROWS_PER_LOOP % ROWS_PER_LDS != 0 }; + + // The total number of LDS per loop. + enum { LDS_PER_LOOP = fmha::Div_up::VALUE }; + + // The amount of shared memory. + enum { BYTES_PER_TILE = ROWS_PER_LOOP * BYTES_PER_ROW }; + + // Warps. + enum { WARPS_M = Cta_tile::WARPS_M }; + + enum { WARPS_N = Cta_tile::WARPS_N }; + + enum { WARPS_K = Cta_tile::WARPS_K }; + + // Determine the config. 
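// Illustrative sketch (not part of the patch): the flags below simply name the warp layout
// as WARPS_M x WARPS_N x WARPS_K. A host-side illustration of the CTA sizes those layouts
// imply (a 32-thread warp is assumed):
constexpr int threads_per_cta(int warps_m, int warps_n, int warps_k) {
  return warps_m * warps_n * warps_k * 32;
}

// Flash-attention style: all four warps split the M (query) dimension, no split-K.
static_assert(threads_per_cta(4, 1, 1) == 128, "WARPS_4x1x1 -> 128 threads");
// Split-K style: two warps along M, two along K, reduced through this smem tile.
static_assert(threads_per_cta(2, 1, 2) == 128, "WARPS_2x1x2 -> 128 threads");
// Deep split-K for longer sequences: eight warps along K.
static_assert(threads_per_cta(1, 1, 8) == 256, "WARPS_1x1x8 -> 256 threads");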
+ enum { WARPS_2x1x2 = WARPS_M == 2 && WARPS_N == 1 && WARPS_K == 2 }; + + enum { WARPS_1x1x8 = WARPS_M == 1 && WARPS_N == 1 && WARPS_K == 8 }; + + enum { WARPS_1x1x4 = WARPS_M == 1 && WARPS_N == 1 && WARPS_K == 4 }; + + // Flash Attention uses WARPS_4x1x1 + enum { WARPS_4x1x1 = WARPS_M == 4 && WARPS_N == 1 && WARPS_K == 1 }; + + // Ctor. + inline __device__ Smem_tile_o(void* smem, int tidx) { + // Get a 32-bit value for the shared memory address. + uint32_t smem_ = __nvvm_get_smem_pointer(smem); + + // The row/col written by the thread. + int write_row, write_col; + + // SEQLEN == 128. Segments of 128B are written by 2 warps. + if (WARPS_2x1x2 && Cta_tile::N == 32) { + write_row = (tidx & 0x30) / 2 + (tidx & 0x07); + write_col = (tidx & 0x0f); + write_col ^= (tidx & 0x40) / 16; + + // SEQLEN == 128 and N == 64. + } else if (WARPS_2x1x2 && Cta_tile::N == 64) { + write_row = (tidx & 0x30) / 2 + (tidx & 0x07); + write_col = (tidx & 0x40) / 8 + (tidx & 0x08) * 2 + (tidx & 0x07); + + // SEQLEN == 256, 384 and N == 32. Segments of 128B are written by 2 warps. + } else if (WARPS_1x1x4 && Cta_tile::N == 32) { + write_row = (tidx & 0x10) / 2 + (tidx & 0x07); + write_col = (tidx & 0x40) / 8 + (tidx & 0x08) * 2 + (tidx & 0x07); + write_col ^= (tidx & 0x20) / 8; + + // SEQLEN == 256, 384 and N == 64. + } else if (WARPS_1x1x4 && Cta_tile::N == 64) { + write_row = (tidx & 0x10) / 2 + (tidx & 0x07); + write_col = (tidx & 0x60) / 4 + (tidx & 0x08) * 4 + (tidx & 0x07); + + // SEQLEN == 256, 384, 512 and N == 128. + } else if (WARPS_1x1x4 && Cta_tile::N == 128) { + write_row = (tidx & 0x10) / 2 + (tidx & 0x07); + write_col = (tidx & 0x60) / 2 + (tidx & 0x08) * 8 + (tidx & 0x07); + + // SEQLEN == 256, 384, 512 and N == 256. + } else if (WARPS_1x1x4 && Cta_tile::N == 256) { + write_row = (tidx & 0x10) / 2 + (tidx & 0x07); + write_col = (tidx & 0x60) / 1 + (tidx & 0x08) * 16 + (tidx & 0x07); + + // SEQLEN == 256, 384, 512 and N == 32. Segments of 128B are written by 2 warps. + } else if (WARPS_1x1x8 && Cta_tile::N == 32) { + write_row = (tidx & 0x10) / 2 + (tidx & 0x07); + write_col = (tidx & 0xc0) / 8 + (tidx & 0x08) * 4 + (tidx & 0x07); + write_col ^= (tidx & 0x20) / 8; + + // SEQLEN == 256, 384, 512 and N == 64. + } else if (WARPS_1x1x8 && Cta_tile::N == 64) { + write_row = (tidx & 0x10) / 2 + (tidx & 0x07); + write_col = (tidx & 0xe0) / 4 + (tidx & 0x08) * 8 + (tidx & 0x07); + + // ANY SEQLEN and N == 32 + } else if (WARPS_4x1x1 && Cta_tile::N == 32) { + write_row = (tidx & 0xf0) / 2 + (tidx & 0x07); + write_col = (tidx & 0x07); + write_col ^= (tidx & 0x08) / 2; + + // ANY SEQLEN and N == 64 + } else if (WARPS_4x1x1 && Cta_tile::N == 64) { + write_row = (tidx & 0x70) / 2 + (tidx & 0x07); + write_col = (tidx & 0x0f); + + // ANY SEQLEN and N == 128 + } else if (WARPS_4x1x1 && Cta_tile::N == 128) { + write_row = (tidx & 0x70) / 2 + (tidx & 0x07); + write_col = (tidx & 0x08) + (tidx & 0x0f); + + // ANY SEQLEN and N == 256 + } else if (WARPS_4x1x1 && Cta_tile::N == 256) { + write_row = (tidx & 0x70) / 2 + (tidx & 0x07); + write_col = (tidx & 0x08) * 3 + (tidx & 0x0f); + + // Not supported. + } else { + assert(false); + } + + // Assemble the write pointer. + smem_write_ = smem_ + write_row * BYTES_PER_ROW + write_col * BYTES_PER_STS; + + // The element read by each thread. + int read_row = tidx / THREADS_PER_ROW; + int read_col = tidx % THREADS_PER_ROW; + + // Take the XOR pattern into account for the column. + read_col ^= read_row & 0x7; + + // Assemble the read pointer. 
+ smem_read_ = smem_ + read_row * BYTES_PER_ROW + read_col * BYTES_PER_LDS; + + // Is that thread active on the last LDS? + if (HAS_INCOMPLETE_LDS) { + is_active_for_last_lds_ = read_row + (LDS_PER_LOOP - 1) * ROWS_PER_LDS < Cta_tile::M; + } + } + + // Load the output fragments. + inline __device__ void load(uint4 (&out)[LDS_PER_LOOP]) const { + uint32_t local_smem_read_ = smem_read_; +#pragma unroll + for (int ii = 0; ii < LDS_PER_LOOP; ++ii) { + // Apply the XOR pattern if needed. (XOR 8 default) + if (ROWS_PER_LDS < 8) { + local_smem_read_ = (smem_read_ ^ ((ii * ROWS_PER_LDS) % 8 * BYTES_PER_LDS)); + } + + // Load the elements before the reduction (split-K). + uint4 tmp[Cta_tile::WARPS_K * 2]; +#pragma unroll + for (int jj = 0; jj < Cta_tile::WARPS_K * 2; ++jj) { + // The immediate. + int imm = ii * ROWS_PER_LDS * BYTES_PER_ROW; + if (Cta_tile::N == 256) { + imm += jj * 512; + } else if (Cta_tile::N == 128) { + imm += jj * 256; + } else if (Cta_tile::N == 64) { + imm += jj * 128; + } else if (Cta_tile::N == 32) { + imm += jj / 2 * 128; + } else { + assert(false); + } + + // The XOR mask. + int smem_read_offset = local_smem_read_; + if (Cta_tile::N == 32 && (jj % 2) == 1) { + smem_read_offset ^= 64; + } + + // Load... + if (!HAS_INCOMPLETE_LDS || (ii < LDS_PER_LOOP - 1 || is_active_for_last_lds_)) { + fmha::lds(tmp[jj], smem_read_offset + imm); + } + } + + // Perform the reduction. + out[ii] = tmp[0]; +#pragma unroll + for (int jj = 1; jj < Cta_tile::WARPS_K * 2; ++jj) { + out[ii] = fmha::hadd8(out[ii], tmp[jj]); + } + } + } + + // Store the accumulators. + template + inline __device__ void store(Accumulator const (&acc)[M][N], int mi) { + enum { M_PER_MMA = Mma_tile::M_PER_MMA_PER_CTA }; + +#pragma unroll + for (int ni = 0; ni < Mma_tile::VALID_MMAS_N; ++ni) { + // The number of MMAs that are stored per loop iteration. + enum { MMAS_M_PER_LOOP = Mma_tile::MMAS_M / LOOPS }; + +// Store 1st column of the different MMAs. +#pragma unroll + for (int mj = 0; mj < MMAS_M_PER_LOOP; ++mj) { + // Assemble the vectors for the stores. See how we swizzle the registers. + uint4 tmp_0; + tmp_0.x = acc[mi * MMAS_M_PER_LOOP + mj][ni].reg(0); + tmp_0.y = acc[mi * MMAS_M_PER_LOOP + mj][ni].reg(1); + tmp_0.z = acc[mi * MMAS_M_PER_LOOP + mj][ni].reg(4); + tmp_0.w = acc[mi * MMAS_M_PER_LOOP + mj][ni].reg(5); + + uint4 tmp_1; + tmp_1.x = acc[mi * MMAS_M_PER_LOOP + mj][ni].reg(2); + tmp_1.y = acc[mi * MMAS_M_PER_LOOP + mj][ni].reg(3); + tmp_1.z = acc[mi * MMAS_M_PER_LOOP + mj][ni].reg(6); + tmp_1.w = acc[mi * MMAS_M_PER_LOOP + mj][ni].reg(7); + + // Precompute the immediates to jump to the correct row. + int row = mj * M_PER_MMA * BYTES_PER_ROW; + + // The columns. + int smem_write_0 = smem_write_ ^ ((2 * ni + 0) * BYTES_PER_STS); + int smem_write_1 = smem_write_ ^ ((2 * ni + 1) * BYTES_PER_STS); + + // Store. + fmha::sts(smem_write_0 + row, tmp_0); + fmha::sts(smem_write_1 + row, tmp_1); + } + } + } + + // The write pointer. + uint32_t smem_write_; + // The write pointer. + uint32_t smem_read_; + // Is the thread active for the last LDS of the series? + int is_active_for_last_lds_; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// This class converts the FP16/FP32 inputs to FP16x2. + +struct Convert_from_fp16 { + // Convert one pair of fp16 numbers. + template + static inline __device__ uint32_t convert(Accumulators const& acc, int ii) { + // Extract the 2x FP16 numbers (packed in a register). 
+ uint32_t h2 = acc.reg(ii); + + return h2; + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct Convert_from_fp32 { + // Convert one pair of fp16 numbers. + template + inline __device__ uint32_t convert(Accumulators const& acc, int ii) { + // Extract the 2x floats. + float f0 = acc.elt(ii * 2 + 0); + float f1 = acc.elt(ii * 2 + 1); + + // Convert to FP16x2. + return fmha::float2_to_half2(f0, f1); + } + + // The bf16 accumulators (convert from fp32 to 2xbf16). + using Ampere_bf16_Accumulator = fmha::Fragment_accumulator; + + static inline __device__ uint32_t convert(Ampere_bf16_Accumulator const& acc, int ii) { + // Extract the 2x floats. + float f0 = acc.elt(ii * 2 + 0); + float f1 = acc.elt(ii * 2 + 1); + + // Convert to FP16x2. + return fmha::float2_to_bf16_x2(f0, f1); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Hmma_smem_tile_o { + // The MMA tile. + using Mma_tile = typename Traits::template Mma_tile; + // The accumulators. + using Accumulator = fmha::Fragment_accumulator; + // The data type. + using Data_type = typename Accumulator::Data_type; + // The epilogue data type + using Epilogue_type = typename Traits::Epilogue_type; + + // The size of each element. + enum { BYTES_PER_ELEMENT = sizeof(Epilogue_type) }; + + // The amount of bytes per row (without packing or split-k). + enum { BYTES_PER_ROW = Cta_tile::N * BYTES_PER_ELEMENT }; + + // The size of each STS. + enum { BYTES_PER_STS = BYTES_PER_STS_ }; + + // The size of each LDS. + enum { BYTES_PER_LDS = 16 }; + + // The number of threads (to produce 16B per LDS). + enum { THREADS_PER_ROW = BYTES_PER_ROW / BYTES_PER_LDS }; + + // The number of rows loaded per LDS. + enum { ROWS_PER_LDS = Cta_tile::THREADS_PER_CTA / THREADS_PER_ROW }; + + // The number of rows in shared memory. + enum { ROWS = Cta_tile::M }; + + // We want at least one output per thread (if possible). + enum { ROWS_PER_LOOP_ = ROWS <= 64 ? ROWS : (int)Min::VALUE }; + + // We also want to have "complete" MMAs. + enum { ROWS_PER_LOOP = Max::VALUE }; + + // The number of outer loops. + enum { LOOPS = fmha::Div_up::VALUE }; + + // Make sure it matches our expectations. + static_assert(ROWS_PER_LOOP >= (int)Mma_tile::M_PER_MMA_PER_CTA, ""); + + // Do we have to guard against partial writes/reads. + enum { HAS_INCOMPLETE_LDS = ROWS_PER_LOOP % ROWS_PER_LDS != 0 }; + + // The total number of LDS per loop. + enum { LDS_PER_LOOP = fmha::Div_up::VALUE }; + + // The amount of shared memory. + enum { BYTES_PER_TILE = ROWS_PER_LOOP * BYTES_PER_ROW * Cta_tile::WARPS_K }; + + // The amount of row packing to make sure we have at least 128B per smem row (without split-k). + enum { ROW_PACKING = Max<1, 128 / BYTES_PER_ROW>::VALUE }; + + // Make sure our row packing is correct + static_assert(ROWS_PER_LOOP % ROW_PACKING == 0, ""); + + // The amount of shared memory per row after packing. + enum { BYTES_PER_ROW_WITH_PACKING = BYTES_PER_ROW * ROW_PACKING }; + + // Make sure we have at least 128B per row after packing. + static_assert(BYTES_PER_ROW_WITH_PACKING >= 128, ""); + + // The number of threads per row after packing. + enum { THREADS_PER_ROW_WITH_PACKING = THREADS_PER_ROW * ROW_PACKING }; + + // Make sure we have at least 8 threads per row after packing. + static_assert(THREADS_PER_ROW_WITH_PACKING >= 8, ""); + + // Warps. 
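// Illustrative sketch (not part of the patch): ROW_PACKING above squeezes several logical
// output rows into one 128B smem row when the head size is small. A host-side restatement,
// assuming a 2-byte epilogue type (fp16/bf16):
constexpr int row_packing(int n, int bytes_per_element) {
  return 128 / (n * bytes_per_element) > 1 ? 128 / (n * bytes_per_element) : 1;
}

static_assert(row_packing(16, 2) == 4, "N = 16: four 32B rows per 128B smem row");
static_assert(row_packing(32, 2) == 2, "N = 32: two 64B rows per 128B smem row");
static_assert(row_packing(64, 2) == 1, "N = 64: a row already fills 128B, no packing");
static_assert(row_packing(256, 2) == 1, "N = 256: no packing either");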
+ enum { WARPS_M = Cta_tile::WARPS_M }; + + enum { WARPS_N = Cta_tile::WARPS_N }; + + enum { WARPS_K = Cta_tile::WARPS_K }; + + // Determine the config. + enum { WARPS_2x1x2 = WARPS_M == 2 && WARPS_N == 1 && WARPS_K == 2 }; + + enum { WARPS_1x1x8 = WARPS_M == 1 && WARPS_N == 1 && WARPS_K == 8 }; + + enum { WARPS_1x1x4 = WARPS_M == 1 && WARPS_N == 1 && WARPS_K == 4 }; + + // Flash Attention uses WARPS_4x1x1 + enum { WARPS_4x1x1 = WARPS_M == 4 && WARPS_N == 1 && WARPS_K == 1 }; + + enum { WARPS_4x1x2 = WARPS_M == 4 && WARPS_N == 1 && WARPS_K == 2 }; + + // Ctor. + inline __device__ Hmma_smem_tile_o(void* smem, int tidx) { + // Get a 32-bit value for the shared memory address. + uint32_t smem_ = __nvvm_get_smem_pointer(smem); + + // The row/col written by the thread. + int write_row, write_col; + + // SEQLEN == 128 and HIDDEN_SIZE_PER_HEAD == 16. + if (WARPS_2x1x2 && Cta_tile::N == 16) { + write_row = (tidx & 0x20) / 8 + (tidx & 0x10) / 16; + write_col = (tidx & 0x40) / 2 + (tidx & 0x0c) * 2 + (tidx & 0x03); + write_col ^= (tidx & 0x10) / 4; + + // SEQLEN == 128 and HIDDEN_SIZE_PER_HEAD == 32. + } else if (WARPS_2x1x2 && Cta_tile::N == 32) { + write_row = (tidx & 0x20) / 4 + (tidx & 0x18) / 8; + write_col = (tidx & 0x40) / 2 + (tidx & 0x04) * 4 + (tidx & 0x03); + write_col ^= (tidx & 0x18) / 2; + + // SEQLEN == 128 and HIDDEN_SIZE_PER_HEAD == 64. + } else if (WARPS_2x1x2 && Cta_tile::N == 64) { + write_row = (tidx & 0x20) / 2 + (tidx & 0x1c) / 4; + write_col = (tidx & 0x40) / 2 + (tidx & 0x03); + write_col ^= (tidx & 0x1c); + + // SEQLEN == 128 and HIDDEN_SIZE_PER_HEAD == 128. + } else if (WARPS_2x1x2 && Cta_tile::N == 128) { + write_row = (tidx & 0x20) / 2 + (tidx & 0x1c) / 4; + write_col = (tidx & 0x40) / 1 + (tidx & 0x1f); + + // SEQLEN == 256, 384, 512 and HIDDEN_SIZE_PER_HEAD == 16. + } else if ((WARPS_1x1x8 || WARPS_1x1x4) && Cta_tile::N == 16) { + write_row = (tidx & 0x10) / 16; + write_col = (tidx & 0x0c) * 2 + (tidx & 0xe3); + write_col ^= (tidx & 0x10) / 4; + + // SEQLEN == 256, 384, 512 and HIDDEN_SIZE_PER_HEAD == 32. + } else if ((WARPS_1x1x8 || WARPS_1x1x4) && Cta_tile::N == 32) { + write_row = (tidx & 0x18) / 8; + write_col = (tidx & 0x04) * 4 + (tidx & 0xe3); + write_col ^= (tidx & 0x18) / 2; + + // SEQLEN == 256, 384 and HIDDEN_SIZE_PER_HEAD == 64. + } else if ((WARPS_1x1x8 || WARPS_1x1x4) && Cta_tile::N == 64) { + write_row = (tidx & 0x1c) / 4; + write_col = (tidx & 0xff); + + // SEQLEN == 256, 384 and HIDDEN_SIZE_PER_HEAD == 128. + } else if ((WARPS_1x1x8 || WARPS_1x1x4) && Cta_tile::N == 128) { + write_row = (tidx & 0x1c) / 4; + write_col = (tidx & 0xe0) * 2 + (tidx & 0x1f); + + // SEQLEN == 256, 384 and HIDDEN_SIZE_PER_HEAD == 256. + } else if ((WARPS_1x1x8 || WARPS_1x1x4) && Cta_tile::N == 256) { + write_row = (tidx & 0x1c) / 4; + write_col = (tidx & 0xe0) * 4 + (tidx & 0x1f); + + // ANY SEQLEN and HIDDEN_SIZE_PER_HEAD == 16. + } else if (WARPS_4x1x1 && Cta_tile::N == 16) { + write_row = (tidx & 0xe0) / 8 + (tidx & 0x10) / 16; + write_col = (tidx & 0x0c) * 2 + (tidx & 0x03); + write_col ^= (tidx & 0x10) / 4; + + // ANY SEQLEN and HIDDEN_SIZE_PER_HEAD == 32. + } else if (WARPS_4x1x1 && Cta_tile::N == 32) { + write_row = (tidx & 0xe0) / 4 + (tidx & 0x18) / 8; + write_col = (tidx & 0x04) * 4 + (tidx & 0x03); + write_col ^= (tidx & 0x18) / 2; + + // ANY SEQLEN and HIDDEN_SIZE_PER_HEAD == 64/128. 
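// Illustrative sketch (not part of the patch): the bit masks in this chain are a warp/lane
// decode. For the WARPS_4x1x1, N == 64/128 branch just below, a host-side check that the
// formulas reduce to "warp * 16 + lane / 4" for the row and "lane" for the STS column:
#include <cassert>

int main() {
  for (int tidx = 0; tidx < 128; ++tidx) {
    int const warp = tidx / 32;
    int const lane = tidx % 32;
    int const write_row = (tidx & 0x60) / 2 + (tidx & 0x1c) / 4;
    int const write_col = (tidx & 0x1f);
    assert(write_row == warp * 16 + lane / 4);
    assert(write_col == lane);
  }
  return 0;
}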
+ } else if (WARPS_4x1x1 && (Cta_tile::N == 64 || Cta_tile::N == 128)) { + write_row = (tidx & 0x60) / 2 + (tidx & 0x1c) / 4; + write_col = (tidx & 0x1f); + + // ANY SEQLEN and HIDDEN_SIZE_PER_HEAD == 256. + } else if (WARPS_4x1x1 && (Cta_tile::N == 256 || Cta_tile::N == 512)) { + write_row = (tidx & 0x60) / 2 + (tidx & 0x1c) / 4; + write_col = (tidx & 0x1f); + + // GMMA: S=284/512 and HIDDEN_SIZE_PER_HEAD == 64. + } else if (WARPS_4x1x2 && Cta_tile::N == 64) { + write_row = (tidx & 0x60) / 2 + (tidx & 0x1c) / 4; + write_col = (tidx & 0x80) / 4 + (tidx & 0x03); + write_col ^= (tidx & 0x1c); + + // GMMA: S=284/512 and HIDDEN_SIZE_PER_HEAD == 64. + } else if (WARPS_4x1x2 && Cta_tile::N == 32) { + write_row = (tidx & 0x60) / 4 + (tidx & 0x1c) / 8; + write_col = (tidx & 0x80) / 4 + (tidx & 0x04) * 4 + (tidx & 0x03); + write_col ^= (tidx & 0x18) / 2; + + // Not supported. + } else { + assert(false); + } + + // Assemble the write pointer. + smem_write_ = smem_ + write_row * BYTES_PER_ROW_WITH_PACKING * Cta_tile::WARPS_K + + write_col * BYTES_PER_STS; + + // The element read by each thread. + int read_row = tidx / THREADS_PER_ROW; + int read_col = tidx % THREADS_PER_ROW; + + // Is that thread active on the last LDS? + if (HAS_INCOMPLETE_LDS) { + is_active_for_last_lds_ = read_row + (LDS_PER_LOOP - 1) * ROWS_PER_LDS < ROWS_PER_LOOP; + } + + // The XOR params. + int const XOR_MOD = 8 / ROW_PACKING; + + // Take the XOR pattern and the packing into account for the column. + read_col += read_row % ROW_PACKING * XOR_MOD; + read_row /= ROW_PACKING; + read_col ^= read_row % XOR_MOD; + + // Assemble the read pointer. + smem_read_ = smem_ + read_row * BYTES_PER_ROW_WITH_PACKING * Cta_tile::WARPS_K + + read_col * BYTES_PER_LDS; + } + + // Load the output fragments. + inline __device__ void load(uint4 (&out)[LDS_PER_LOOP]) const { + uint32_t local_smem_read_ = smem_read_; +#pragma unroll + for (int ii = 0; ii < LDS_PER_LOOP; ++ii) { + // Apply the XOR pattern if needed. (XOR 8 default) + if (ROWS_PER_LDS < 8) { + local_smem_read_ = (smem_read_ ^ ((ii * ROWS_PER_LDS) % 8 * BYTES_PER_LDS)); + } + + // Load the elements before the reduction (split-K). + uint4 tmp[Cta_tile::WARPS_K]; +#pragma unroll + for (int jj = 0; jj < Cta_tile::WARPS_K; ++jj) { + // Note: ROWS_PER_LDS does not take packing into account - hence BYTES_PER_ROW. + int imm = + ii * ROWS_PER_LDS * BYTES_PER_ROW * Cta_tile::WARPS_K + jj * BYTES_PER_ROW_WITH_PACKING; + + // Load... + if (!HAS_INCOMPLETE_LDS || (ii < LDS_PER_LOOP - 1 || is_active_for_last_lds_)) { + fmha::lds(tmp[jj], local_smem_read_ + imm); + } + } + + // Perform the reduction. + out[ii] = tmp[0]; +#pragma unroll + for (int jj = 1; jj < Cta_tile::WARPS_K; ++jj) { + out[ii] = fmha::add8(out[ii], tmp[jj]); + } + } + } + + // Store the accumulators. + template + inline __device__ void store_(Accumulators const (&acc)[M][N], int mi) { + enum { M_PER_MMA = Mma_tile::M_PER_MMA_PER_CTA }; + + Converter converter; +#pragma unroll + for (int ni = 0; ni < Mma_tile::MMAS_N; ++ni) { + // The number of MMAs that are stored per loop iteration. + enum { MMAS_M_PER_LOOP = Mma_tile::MMAS_M / LOOPS }; + + // Store 1st column of the different MMAs. + // Skip N paddings + if (ni < Mma_tile::VALID_MMAS_N) { +#pragma unroll + for (int mj = 0; mj < MMAS_M_PER_LOOP; ++mj) { + // Precompute the immediates to jump between rows. 
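// Illustrative sketch (not part of the patch): each MMA covers 16 rows per warp and a
// thread's accumulators land in two of them, 8 rows apart, hence the "+ 0" / "+ 8"
// immediates below. Worked byte offsets, assuming M_PER_MMA = 16, a 128B row
// (N = 64, fp16 epilogue) and WARPS_K = 1:
constexpr int row_byte_offset(int mj, int row_in_mma, int m_per_mma = 16,
                              int bytes_per_row = 128, int warps_k = 1) {
  return (mj * m_per_mma + row_in_mma) * bytes_per_row * warps_k;
}

static_assert(row_byte_offset(0, 0) == 0, "mj = 0, upper row of the pair");
static_assert(row_byte_offset(0, 8) == 1024, "mj = 0, lower row: 8 rows * 128B further");
static_assert(row_byte_offset(1, 0) == 2048, "mj = 1 starts 16 rows * 128B further");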
+ int row_0 = (mj * M_PER_MMA + 0) * BYTES_PER_ROW * Cta_tile::WARPS_K; + int row_1 = (mj * M_PER_MMA + 8) * BYTES_PER_ROW * Cta_tile::WARPS_K; + + // The values (2 halves per register). + uint32_t h0 = converter.convert(acc[mi * MMAS_M_PER_LOOP + mj][ni], 0); + uint32_t h1 = converter.convert(acc[mi * MMAS_M_PER_LOOP + mj][ni], 1); + + // Store to shared memory. + fmha::sts(smem_write_ + row_0, h0); + fmha::sts(smem_write_ + row_1, h1); + } + } + + // Swizzle the write pointer using a XOR of 16B. + smem_write_ ^= 16; + + // Store 2nd column of the different MMAs. + // Skip N paddings + if (ni < Mma_tile::VALID_MMAS_N) { +#pragma unroll + for (int mj = 0; mj < MMAS_M_PER_LOOP; ++mj) { + // Precompute the immediates to jump between rows. + int row_0 = (mj * M_PER_MMA + 0) * BYTES_PER_ROW * Cta_tile::WARPS_K; + int row_1 = (mj * M_PER_MMA + 8) * BYTES_PER_ROW * Cta_tile::WARPS_K; + + // The values (2 halves per register). + uint32_t h2 = converter.convert(acc[mi * MMAS_M_PER_LOOP + mj][ni], 2); + uint32_t h3 = converter.convert(acc[mi * MMAS_M_PER_LOOP + mj][ni], 3); + + // Store to shared memory. + fmha::sts(smem_write_ + row_0, h2); + fmha::sts(smem_write_ + row_1, h3); + } + } + + // Cancel the previous XOR of 1 + swizzle the write pointer using a XOR of 32B or 64B. + if (ROW_PACKING == 4) { + smem_write_ ^= 16; + } else if (ROW_PACKING == 2) { + smem_write_ ^= 3 * 16; + } else if (ROW_PACKING == 1) { + //         7 + //       /    \ + //      3      3 + //    /  \    /  \ + //   1    1  1    1 + static_assert(Mma_tile::MMAS_N <= 64, ""); + if (Mma_tile::MMAS_N >= 32 && ni % 16 == 15) { + smem_write_ ^= 63 * 16; + } else if (Mma_tile::MMAS_N >= 16 && ni % 8 == 7) { + smem_write_ ^= 31 * 16; + } else if (Mma_tile::MMAS_N >= 8 && ni % 4 == 3) { + smem_write_ ^= 15 * 16; + } else if (Mma_tile::MMAS_N >= 4 && ni % 2 == 1) { + smem_write_ ^= 7 * 16; + } else if (Mma_tile::MMAS_N >= 2) { + smem_write_ ^= 3 * 16; + } + } else { + assert(false); + } + } + } + + // The write pointer. + uint32_t smem_write_; + // The write pointer. + uint32_t smem_read_; + // Is the thread active for the last LDS of the series? + int is_active_for_last_lds_; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Smem_tile_o + : public Hmma_smem_tile_o { + // The traits class. + using Traits = fmha::Turing_hmma_fp16_traits; + // The base class. + using Base = Hmma_smem_tile_o; + + // The FP16 accumulators. + using Accumulators_fp16 = fmha::Fragment_accumulator; + // The FP32 accumulators. + using Accumulators_fp32 = fmha::Fragment_accumulator; + + // Ctor. + inline __device__ Smem_tile_o(void* smem, int tidx) : Base(smem, tidx) {} + + // Store from FP16 accumulators. That's the default. + template + inline __device__ void store(Accumulators_fp16 const (&acc)[M][N], int mi) { + this->template store_(acc, mi); + } + + // Store from FP32 accumulators. Special trick for the Flash-attention kernel. + // Convert from fp32 to fp16 before STS + template + inline __device__ void store(Accumulators_fp32 const (&acc)[M][N], int mi) { + this->template store_(acc, mi); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Smem_tile_o + : public Hmma_smem_tile_o { + // The traits class. + using Traits = fmha::Ampere_hmma_fp16_traits; + // The base class. + using Base = Hmma_smem_tile_o; + + // The FP16 accumulators. 
+ using Accumulators_fp16 = fmha::Fragment_accumulator; + // The FP32 accumulators. + using Accumulators_fp32 = fmha::Fragment_accumulator; + + // Ctor. + inline __device__ Smem_tile_o(void* smem, int tidx) : Base(smem, tidx) {} + + // Store from FP16 accumulators. That's the default. + template + inline __device__ void store(Accumulators_fp16 const (&acc)[M][N], int mi) { + this->template store_(acc, mi); + } + + // Store from FP32 accumulators. Special trick for the Flash-attention kernel. + // Convert from fp32 to fp16 before STS + template + inline __device__ void store(Accumulators_fp32 const (&acc)[M][N], int mi) { + this->template store_(acc, mi); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Smem_tile_o + : public Hmma_smem_tile_o { + // The traits class. + using Traits = fmha::Ampere_hmma_bf16_bf16_traits; + // The base class. + using Base = Hmma_smem_tile_o; + + // The FP32 accumulators (only FP32 acc is supported for BF16 MMA). + using Accumulators_bf16 = fmha::Fragment_accumulator; + + // Ctor. + inline __device__ Smem_tile_o(void* smem, int tidx) : Base(smem, tidx) {} + + // Store from FP32 accumulators. Special trick for the Flash-attention kernel. + // Convert from fp32 to bf16 before STS + template + inline __device__ void store(Accumulators_bf16 const (&acc)[M][N], int mi) { + this->template store_(acc, mi); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Smem_tile_o + : public Hmma_smem_tile_o { + // The traits class. + using Traits = fmha::Ampere_hmma_fp32_traits; + // The base class. + using Base = Hmma_smem_tile_o; + // The MMA tile. + using Mma_tile = typename Base::Mma_tile; + // The accumulators. + using Accumulator = typename Base::Accumulator; + + // The size of each + enum { BYTES_PER_ELEMENT = Base::BYTES_PER_ELEMENT }; + + // The size of each row in shared memory. + enum { BYTES_PER_ROW = Base::BYTES_PER_ROW * Cta_tile::WARPS_K }; + + // The size of each row in shared memory. + enum { BYTES_PER_LDS = Base::BYTES_PER_LDS }; + + // The number of threads (to produce 16B per LDS). + enum { THREADS_PER_ROW = Base::THREADS_PER_ROW }; + + // The number of outer loops. + enum { LOOPS = Base::LOOPS }; + + // The number of rows loaded per LDS. + enum { ROWS_PER_LDS = Base::ROWS_PER_LDS }; + + // Do we have to guard against partial writes/reads. + enum { HAS_INCOMPLETE_LDS = Base::HAS_INCOMPLETE_LDS }; + + // The total number of LDS per loop. + enum { LDS_PER_LOOP = Base::LDS_PER_LOOP }; + + // Ctor. + inline __device__ Smem_tile_o(void* smem, int tidx) : Base(smem, tidx) { + // Get a 32-bit value for the shared memory address. + uint32_t smem_ = __nvvm_get_smem_pointer(smem); + + // The element read by each thread. + int read_row = tidx / THREADS_PER_ROW; + int read_col = tidx % THREADS_PER_ROW; + + // Take the XOR pattern into account for the column. + read_col ^= (read_row & 0x7) * 2; + + // Assemble the read pointer. + this->smem_read_ = smem_ + read_row * BYTES_PER_ROW + read_col * BYTES_PER_LDS; + + // Is that thread active on the last LDS? + if (HAS_INCOMPLETE_LDS) { + this->is_active_for_last_lds_ = read_row + (LDS_PER_LOOP - 1) * ROWS_PER_LDS < Cta_tile::M; + } + } + + // Load the output fragments. + inline __device__ void load(uint4 (&out)[LDS_PER_LOOP]) const { +#pragma unroll + for (int ii = 0; ii < LDS_PER_LOOP; ++ii) { + // Load the elements before the reduction (split-K). 
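// Illustrative sketch (not part of the patch): the loop below is the split-K reduction.
// Each of the WARPS_K warps stored a partial output copy side by side in smem, and every
// 16B vector read here is summed across those copies. A scalar host-side analogue,
// assuming the partial tiles sit `stride` floats apart:
#include <cassert>

float reduce_split_k(float const* partials, int warps_k, int stride, int idx) {
  float sum = partials[idx];
  for (int jj = 1; jj < warps_k; ++jj) {
    sum += partials[jj * stride + idx];
  }
  return sum;
}

int main() {
  float tiles[2 * 4] = {1.f, 2.f, 3.f, 4.f,       // partial tile written by warp-k 0
                        10.f, 20.f, 30.f, 40.f};  // partial tile written by warp-k 1
  assert(reduce_split_k(tiles, /*warps_k=*/2, /*stride=*/4, /*idx=*/2) == 33.f);
  return 0;
}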
+ uint4 tmp[Cta_tile::WARPS_K]; +#pragma unroll + for (int jj = 0; jj < Cta_tile::WARPS_K; ++jj) { + int imm = ii * ROWS_PER_LDS * BYTES_PER_ROW + jj * Cta_tile::N * BYTES_PER_ELEMENT; + int is_valid = ii < LDS_PER_LOOP - 1 || this->is_active_for_last_lds_; + if (!HAS_INCOMPLETE_LDS || is_valid) { + fmha::lds(tmp[jj], this->smem_read_ + imm); + } + } + + // Perform the reduction. + out[ii] = tmp[0]; +#pragma unroll + for (int jj = 1; jj < Cta_tile::WARPS_K; ++jj) { + out[ii] = fmha::fadd4(out[ii], tmp[jj]); + } + } + } + + // Store the accumulators. + template + inline __device__ void store(Accumulator const (&acc)[M][N], int mi) { + enum { M_PER_MMA = Mma_tile::M_PER_MMA_PER_CTA }; + +#pragma unroll + for (int ni = 0; ni < Mma_tile::MMAS_N; ++ni) { + // The number of MMAs that are stored per loop iteration. + enum { MMAS_M_PER_LOOP = Mma_tile::MMAS_M / LOOPS }; + + // Store 1st column of the different MMAs. + if (ni < Mma_tile::VALID_MMAS_N) { +#pragma unroll + for (int mj = 0; mj < MMAS_M_PER_LOOP; ++mj) { + // Precompute the immediates to jump between rows. + int row_0 = (mj * M_PER_MMA + 0) * BYTES_PER_ROW; + int row_1 = (mj * M_PER_MMA + 8) * BYTES_PER_ROW; + + // Pack vectors. + uint2 tmp0; + tmp0.x = acc[mi * MMAS_M_PER_LOOP + mj][ni].reg(0); + tmp0.y = acc[mi * MMAS_M_PER_LOOP + mj][ni].reg(1); + + uint2 tmp1; + tmp1.x = acc[mi * MMAS_M_PER_LOOP + mj][ni].reg(2); + tmp1.y = acc[mi * MMAS_M_PER_LOOP + mj][ni].reg(3); + + // Store. + fmha::sts(this->smem_write_ + row_0, tmp0); + fmha::sts(this->smem_write_ + row_1, tmp1); + } + } + + // Swizzle the write pointer using a XOR of 16B. + this->smem_write_ ^= 32; + + // Store 2nd column of the different MMAs. + if (ni < Mma_tile::VALID_MMAS_N) { +#pragma unroll + for (int mj = 0; mj < MMAS_M_PER_LOOP; ++mj) { + // Precompute the immediates to jump between rows. + int row_0 = (mj * M_PER_MMA + 0) * BYTES_PER_ROW; + int row_1 = (mj * M_PER_MMA + 8) * BYTES_PER_ROW; + + uint2 tmp0, tmp1; + tmp0.x = acc[mi * MMAS_M_PER_LOOP + mj][ni].reg(4); + tmp0.y = acc[mi * MMAS_M_PER_LOOP + mj][ni].reg(5); + + tmp1.x = acc[mi * MMAS_M_PER_LOOP + mj][ni].reg(6); + tmp1.y = acc[mi * MMAS_M_PER_LOOP + mj][ni].reg(7); + + // Store. + fmha::sts(this->smem_write_ + row_0, tmp0); + fmha::sts(this->smem_write_ + row_1, tmp1); + } + } + + // Cancel the previous XOR of 1 + swizzle the write pointer using a XOR of 32B or 64B. + static_assert(Mma_tile::MMAS_N <= 16, ""); + if (Mma_tile::MMAS_N >= 16 && (ni & 7) == 7) { + this->smem_write_ ^= 31 * 32; + } else if (Mma_tile::MMAS_N >= 8 && (ni & 3) == 3) { + this->smem_write_ ^= 15 * 32; + } else if (Mma_tile::MMAS_N >= 4 && (ni & 1) == 1) { + this->smem_write_ ^= 7 * 32; + } else if ((ni & 1) == 0) { + this->smem_write_ ^= 3 * 32; + } + } + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Smem_tile_o + : public Hmma_smem_tile_o { + // The traits class. + using Traits = fmha::Ampere_hmma_bf16_traits; + // The base class. + using Base = Hmma_smem_tile_o; + // The MMA tile. + using Mma_tile = typename Base::Mma_tile; + // The accumulators. + using Accumulator = typename Base::Accumulator; + + // The size of each element. + enum { BYTES_PER_ELEMENT = Base::BYTES_PER_ELEMENT }; + + // The size of each row in shared memory. + enum { BYTES_PER_ROW = Base::BYTES_PER_ROW * Cta_tile::WARPS_K }; + + // The size of each row in shared memory. + enum { BYTES_PER_LDS = Base::BYTES_PER_LDS }; + + // The number of threads (to produce 16B per LDS). 
+ enum { THREADS_PER_ROW = Base::THREADS_PER_ROW }; + + // The number of outer loops. + enum { LOOPS = Base::LOOPS }; + + // The number of rows loaded per LDS. + enum { ROWS_PER_LDS = Base::ROWS_PER_LDS }; + + // Do we have to guard against partial writes/reads. + enum { HAS_INCOMPLETE_LDS = Base::HAS_INCOMPLETE_LDS }; + + // The total number of LDS per loop. + enum { LDS_PER_LOOP = Base::LDS_PER_LOOP }; + + // Ctor. + inline __device__ Smem_tile_o(void* smem, int tidx) : Base(smem, tidx) { + // Get a 32-bit value for the shared memory address. + uint32_t smem_ = __nvvm_get_smem_pointer(smem); + + // The element read by each thread. + int read_row = tidx / THREADS_PER_ROW; + int read_col = tidx % THREADS_PER_ROW; + + // Take the XOR pattern into account for the column. + read_col ^= (read_row & 0x7) * 2; + + // Assemble the read pointer. + this->smem_read_ = smem_ + read_row * BYTES_PER_ROW + read_col * BYTES_PER_LDS; + + // Is that thread active on the last LDS? + if (HAS_INCOMPLETE_LDS) { + this->is_active_for_last_lds_ = read_row + (LDS_PER_LOOP - 1) * ROWS_PER_LDS < Cta_tile::M; + } + } + + // Load the output fragments. + inline __device__ void load(uint4 (&out)[LDS_PER_LOOP]) const { +#pragma unroll + for (int ii = 0; ii < LDS_PER_LOOP; ++ii) { + // Load the elements before the reduction (split-K). + uint4 tmp[Cta_tile::WARPS_K]; +#pragma unroll + for (int jj = 0; jj < Cta_tile::WARPS_K; ++jj) { + int imm = ii * ROWS_PER_LDS * BYTES_PER_ROW + jj * Cta_tile::N * BYTES_PER_ELEMENT; + int is_valid = ii < LDS_PER_LOOP - 1 || this->is_active_for_last_lds_; + if (!HAS_INCOMPLETE_LDS || is_valid) { + fmha::lds(tmp[jj], this->smem_read_ + imm); + } + } + + // Perform the reduction. + out[ii] = tmp[0]; +#pragma unroll + for (int jj = 1; jj < Cta_tile::WARPS_K; ++jj) { + out[ii] = fmha::fadd4(out[ii], tmp[jj]); + } + } + } + + // Store the accumulators. + template + inline __device__ void store(Accumulator const (&acc)[M][N], int mi) { + enum { M_PER_MMA = Mma_tile::M_PER_MMA_PER_CTA }; + +#pragma unroll + for (int ni = 0; ni < Mma_tile::MMAS_N; ++ni) { + // The number of MMAs that are stored per loop iteration. + enum { MMAS_M_PER_LOOP = Mma_tile::MMAS_M / LOOPS }; + + // Store 1st column of the different MMAs. + if (ni < Mma_tile::VALID_MMAS_N) { +#pragma unroll + for (int mj = 0; mj < MMAS_M_PER_LOOP; ++mj) { + // Precompute the immediates to jump between rows. + int row_0 = (mj * M_PER_MMA + 0) * BYTES_PER_ROW; + int row_1 = (mj * M_PER_MMA + 8) * BYTES_PER_ROW; + + // Pack vectors. + uint2 tmp0; + tmp0.x = acc[mi * MMAS_M_PER_LOOP + mj][ni].reg(0); + tmp0.y = acc[mi * MMAS_M_PER_LOOP + mj][ni].reg(1); + + uint2 tmp1; + tmp1.x = acc[mi * MMAS_M_PER_LOOP + mj][ni].reg(2); + tmp1.y = acc[mi * MMAS_M_PER_LOOP + mj][ni].reg(3); + + // Store. + fmha::sts(this->smem_write_ + row_0, tmp0); + fmha::sts(this->smem_write_ + row_1, tmp1); + } + } + + // Swizzle the write pointer using a XOR of 16B. + this->smem_write_ ^= 32; + + // Store 2nd column of the different MMAs. + if (ni < Mma_tile::VALID_MMAS_N) { +#pragma unroll + for (int mj = 0; mj < MMAS_M_PER_LOOP; ++mj) { + // Precompute the immediates to jump between rows. + int row_0 = (mj * M_PER_MMA + 0) * BYTES_PER_ROW; + int row_1 = (mj * M_PER_MMA + 8) * BYTES_PER_ROW; + + uint2 tmp0, tmp1; + tmp0.x = acc[mi * MMAS_M_PER_LOOP + mj][ni].reg(4); + tmp0.y = acc[mi * MMAS_M_PER_LOOP + mj][ni].reg(5); + + tmp1.x = acc[mi * MMAS_M_PER_LOOP + mj][ni].reg(6); + tmp1.y = acc[mi * MMAS_M_PER_LOOP + mj][ni].reg(7); + + // Store. 
+ fmha::sts(this->smem_write_ + row_0, tmp0); + fmha::sts(this->smem_write_ + row_1, tmp1); + } + } + + // Cancel the previous XOR of 1 + swizzle the write pointer using a XOR of 32B or 64B. + static_assert(Mma_tile::MMAS_N <= 16, ""); + if ((ni & 1) == 0) { + this->smem_write_ ^= 3 * 32; + } else if (Mma_tile::MMAS_N >= 16 && (ni & 7) == 7) { + this->smem_write_ ^= 31 * 32; + } else if (Mma_tile::MMAS_N >= 8 && (ni & 3) == 3) { + this->smem_write_ ^= 15 * 32; + } else if (Mma_tile::MMAS_N >= 4 && (ni & 1) == 1) { + this->smem_write_ ^= 7 * 32; + } + } + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// I M M A +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// each thread holds 8 accumulator registers per 16x16 MMA, representing a 2x4 tile +template +struct Regs_to_rows { + template + static inline __device__ void extract(Acc const& acc, uint4& row0, uint4& row1) { + // Volta/Turing: row-major + uint32_t tmp_00 = acc.reg(0); + uint32_t tmp_01 = acc.reg(2); + uint32_t tmp_02 = acc.reg(1); + uint32_t tmp_03 = acc.reg(3); + uint32_t tmp_10 = acc.reg(4); + uint32_t tmp_11 = acc.reg(6); + uint32_t tmp_12 = acc.reg(5); + uint32_t tmp_13 = acc.reg(7); + + row0.x = tmp_00; + row0.y = tmp_01; + row0.z = tmp_02; + row0.w = tmp_03; + + row1.x = tmp_10; + row1.y = tmp_11; + row1.z = tmp_12; + row1.w = tmp_13; + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct Regs_to_rows_8bit { + template + static inline __device__ void extract(Acc const& acc, uint4& row0, uint4& row1) { + // Ampere: col-major + uint32_t tmp_00 = acc.reg(0); + uint32_t tmp_01 = acc.reg(4); + uint32_t tmp_02 = acc.reg(1); + uint32_t tmp_03 = acc.reg(5); + uint32_t tmp_10 = acc.reg(2); + uint32_t tmp_11 = acc.reg(6); + uint32_t tmp_12 = acc.reg(3); + uint32_t tmp_13 = acc.reg(7); + + row0.x = tmp_00; + row0.y = tmp_01; + row0.z = tmp_02; + row0.w = tmp_03; + + row1.x = tmp_10; + row1.y = tmp_11; + row1.z = tmp_12; + row1.w = tmp_13; + } +}; + +template <> +struct Regs_to_rows : public Regs_to_rows_8bit {}; + +template <> +struct Regs_to_rows : public Regs_to_rows_8bit {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Regs_to_rows { + template + static inline __device__ void extract(Acc const& acc, uint2& row0, uint2& row1) { + uint16_t* row0_ptr = reinterpret_cast(&row0); + uint16_t* row1_ptr = reinterpret_cast(&row1); + row0_ptr[0] = acc.u16(0); + row0_ptr[1] = acc.u16(4); + row0_ptr[2] = acc.u16(1); + row0_ptr[3] = acc.u16(5); + + row1_ptr[0] = acc.u16(2); + row1_ptr[1] = acc.u16(6); + row1_ptr[2] = acc.u16(3); + row1_ptr[3] = acc.u16(7); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ void add4(uint4& dst, uint4 const& src) { + reinterpret_cast(dst.x) += reinterpret_cast(src.x); + reinterpret_cast(dst.y) += reinterpret_cast(src.y); + reinterpret_cast(dst.z) += reinterpret_cast(src.z); + reinterpret_cast(dst.w) += reinterpret_cast(src.w); +} + +template +inline __device__ void add_vec(uint4& dst, uint4 const& src) { + add4(dst, src); +} + +template <> +inline __device__ void add_vec(uint4& dst, uint4 const& src) { + dst = fmha::hadd8(dst, src); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// The base 
class for 32-bit/16-bit accumulator types of imma/qmma. +// TODO Can we port Ampere hmma fp32 to this? +template +struct Smem_tile_o_base_8bit_mma { + // The MMA tile. + using Mma_tile = typename Traits::template Mma_tile; + // The accumulators. + using Accumulator = fmha::Fragment_accumulator; + + // The size of each element. + enum { BYTES_PER_ELEMENT = sizeof(typename Traits::Accumulator_type) }; + + // The amount of bytes per row (without packing or split-k). + enum { BYTES_PER_ROW = Cta_tile::N * BYTES_PER_ELEMENT }; + + // The size of each STS. + enum { BYTES_PER_STS = BYTES_PER_ELEMENT * 4 }; + + // The STS Packed Data Type + using Sts_packed_type = typename Uint_from_size_in_bytes::Type; + + // The size of each LDS. + enum { BYTES_PER_LDS = 16 }; + + // The number of threads to store a "row" of the matrix. We force it to 16 for SEQLEN=384. + enum { THREADS_PER_ROW = BYTES_PER_ROW / BYTES_PER_LDS }; + + // The number of rows loaded per LDS. + enum { ROWS_PER_LDS = Cta_tile::THREADS_PER_CTA / THREADS_PER_ROW }; + + // The STS bytes for one quad of threads + enum { BYTES_PER_STS_PER_QUAD = BYTES_PER_STS * 4 }; + + // The xor factor per LDS + // (4 consecutive threads do 64B swizzle for 16B per sts, 32B swizzle for 8B per sts) + enum { XOR_FACTOR = fmha::Div_up::VALUE }; + + // The smem offset in bytes per MMA_N (2 squad threads) + enum { BYTES_OFFSET_PER_MMA_N = BYTES_PER_STS * 8 }; + + // The number of "rows" to process in total. + enum { ROWS = Cta_tile::M }; + + // We want at least one output per thread (if possible). + enum { ROWS_PER_LOOP_ = ROWS <= 64 ? ROWS : (int)Min::VALUE }; + + // We also want to have "complete" MMAs. + enum { ROWS_PER_LOOP = Max::VALUE }; + + // The number of outer loops. + enum { LOOPS = fmha::Div_up::VALUE }; + + // Make sure it matches our expectations. + static_assert(ROWS_PER_LOOP >= (int)Mma_tile::M_PER_MMA_PER_CTA, ""); + + // Do we have to guard against partial writes/reads. + enum { HAS_INCOMPLETE_LDS = ROWS_PER_LOOP % ROWS_PER_LDS != 0 }; + + // The total number of LDS per loop. + enum { LDS_PER_LOOP = fmha::Div_up::VALUE }; + + // The amount of shared memory. + enum { BYTES_PER_TILE = ROWS_PER_LOOP * BYTES_PER_ROW * Cta_tile::WARPS_K }; + + // The amount of row packing to make sure we have at least 128B per smem row (without split-k). + enum { ROW_PACKING = Max<1, 128 / BYTES_PER_ROW>::VALUE }; + + // Make sure our row packing is correct + static_assert(ROWS_PER_LOOP % ROW_PACKING == 0, ""); + + // The amount of shared memory per row after packing. + enum { BYTES_PER_ROW_WITH_PACKING = BYTES_PER_ROW * ROW_PACKING }; + + // Make sure we have at least 128B per row after packing. + static_assert(BYTES_PER_ROW_WITH_PACKING >= 128, ""); + + // The number of threads per row after packing. + enum { THREADS_PER_ROW_WITH_PACKING = THREADS_PER_ROW * ROW_PACKING }; + + // Make sure we have at least 8 threads per row after packing. + static_assert(THREADS_PER_ROW_WITH_PACKING >= 8, ""); + + // Warps. + enum { WARPS_M = Cta_tile::WARPS_M }; + + enum { WARPS_N = Cta_tile::WARPS_N }; + + enum { WARPS_K = Cta_tile::WARPS_K }; + + static_assert(WARPS_K > 1 || std::is_same::value, + "Kernel misconfigured. No split-k needed."); + + // Determine the config. 
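// Illustrative sketch (not part of the patch): the STS width defined above tracks the
// accumulator type. Each thread stores 4 accumulator lanes per STS, so 32-bit accumulators
// give 16B stores (uint4, 64B per quad) and 16-bit accumulators give 8B stores
// (uint2, 32B per quad). Host-side check:
constexpr int bytes_per_sts(int bytes_per_element) { return bytes_per_element * 4; }

static_assert(bytes_per_sts(4) == 16, "int32/fp32 accumulators: 16B STS");
static_assert(bytes_per_sts(2) == 8, "fp16 accumulators: 8B STS");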
+ enum { WARPS_2x1x2 = WARPS_M == 2 && WARPS_N == 1 && WARPS_K == 2 }; + + enum { WARPS_4x1x2 = WARPS_M == 4 && WARPS_N == 1 && WARPS_K == 2 }; + + enum { WARPS_1x1x8 = WARPS_M == 1 && WARPS_N == 1 && WARPS_K == 8 }; + + enum { WARPS_1x1x4 = WARPS_M == 1 && WARPS_N == 1 && WARPS_K == 4 }; + + enum { WARPS_4x1x1 = WARPS_M == 4 && WARPS_N == 1 && WARPS_K == 1 }; + + // Ctor. + inline __device__ Smem_tile_o_base_8bit_mma(void* smem, int tidx) { + // Get a 32-bit value for the shared memory address. + uint32_t smem_ = __nvvm_get_smem_pointer(smem); + + // The row/col written by the thread. + int write_row, write_col; + + // SEQLEN == 128 and HIDDEN_SIZE_PER_HEAD == 16. + if (WARPS_2x1x2 && Cta_tile::N == 16) { + write_row = (tidx & 0x20) / 4 + (tidx & 0x1e) / 8; + write_col = (tidx & 0x40) / 8 + (tidx & 0x07); + + // SEQLEN == 128 and HIDDEN_SIZE_PER_HEAD == 32. + } else if (WARPS_2x1x2 && Cta_tile::N == 32) { + write_row = (tidx & 0x20) / 2 + (tidx & 0x1c) / 4; + write_col = (tidx & 0x40) / 8 + (tidx & 0x07); + + // SEQLEN == 128 and HIDDEN_SIZE_PER_HEAD == 64. + } else if (WARPS_2x1x2 && Cta_tile::N == 64) { + write_row = (tidx & 0x20) / 2 + (tidx & 0x1c) / 4; + write_col = (tidx & 0x40) / 4 + (tidx & 0x07); + + // SEQLEN == 256, 384, 512 and HIDDEN_SIZE_PER_HEAD == 16. + } else if ((WARPS_1x1x8 || WARPS_1x1x4) && Cta_tile::N == 16) { + write_row = (tidx & 0x18) / 8; + write_col = (tidx & 0xe0) / 4 + (tidx & 0x07); + + // SEQLEN == 256, 384, 512 and HIDDEN_SIZE_PER_HEAD == 32. + } else if ((WARPS_1x1x8 || WARPS_1x1x4) && Cta_tile::N == 32) { + write_row = (tidx & 0x1c) / 4; + write_col = (tidx & 0xe0) / 4 + (tidx & 0x07); + + // SEQLEN == 256, 384 and HIDDEN_SIZE_PER_HEAD == 64. + } else if ((WARPS_1x1x8 || WARPS_1x1x4) && Cta_tile::N == 64) { + write_row = (tidx & 0x1c) / 4; + write_col = (tidx & 0xe0) / 2 + (tidx & 0x07); + + // GMMA: HIDDEN_SIZE_PER_HEAD == 64. + } else if (WARPS_4x1x2 && Cta_tile::N == 64) { + write_row = (tidx & 0x60) / 2 + (tidx & 0x1c) / 4; + write_col = (tidx & 0x80) / 8 + (tidx & 0x07); + + // Ada e4m3_fp32 + } else if (WARPS_4x1x1) { + write_row = (tidx & 0x60) / 2 + (tidx & 0x1c) / 4; + write_col = (tidx & 0x80) / 8 + (tidx & 0x07); + + // Not supported. + } else { + assert(false); + } + + // Assemble the write pointer. + smem_write_ = smem_ + write_row * BYTES_PER_ROW_WITH_PACKING * Cta_tile::WARPS_K + + write_col * BYTES_PER_STS; + + // The element read by each thread. + int read_row = tidx / THREADS_PER_ROW; + int read_col = tidx % THREADS_PER_ROW; + + // Is that thread active on the last LDS? + if (HAS_INCOMPLETE_LDS) { + is_active_for_last_lds_ = read_row + (LDS_PER_LOOP - 1) * ROWS_PER_LDS < ROWS_PER_LOOP; + } + + // The XOR params. + constexpr int XOR_MOD = 2 / ROW_PACKING; + + // Take the XOR pattern and the packing into account for the column. + read_col += read_row % ROW_PACKING * XOR_FACTOR; + read_row /= ROW_PACKING; + read_col ^= (read_row % XOR_MOD) * XOR_FACTOR; + + // Assemble the read pointer. + smem_read_ = smem_ + read_row * BYTES_PER_ROW_WITH_PACKING * Cta_tile::WARPS_K + + read_col * BYTES_PER_LDS; + } + + // Load the output fragments. + inline __device__ void load(uint4 (&out)[LDS_PER_LOOP]) const { +#pragma unroll + for (int ii = 0; ii < LDS_PER_LOOP; ++ii) { + // Load the elements before the reduction (split-K). + uint4 tmp[Cta_tile::WARPS_K]; +#pragma unroll + for (int jj = 0; jj < Cta_tile::WARPS_K; ++jj) { + // Note: ROWS_PER_LDS does not take packing into account - hence BYTES_PER_ROW. 
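// Illustrative sketch (not part of the patch): the immediate below combines two strides.
// `ii` walks whole LDS row groups (times WARPS_K, because each packed smem row holds all
// WARPS_K partial copies side by side), while `jj` hops to the next split-K copy inside
// the same packed row. Worked numbers assuming N = 64 with 4B accumulators (256B rows,
// so no packing), WARPS_K = 2 and a 256-thread CTA; the _X names are illustrative only:
constexpr int BYTES_PER_ROW_X = 64 * 4;                            // 256B, already >= 128B
constexpr int BYTES_PER_ROW_WITH_PACKING_X = BYTES_PER_ROW_X * 1;  // ROW_PACKING == 1
constexpr int THREADS_PER_ROW_X = BYTES_PER_ROW_X / 16;            // 16 threads cover one row
constexpr int ROWS_PER_LDS_X = 256 / THREADS_PER_ROW_X;            // 16 rows per LDS
constexpr int WARPS_K_X = 2;

// One step of ii advances 16 rows, each row holding WARPS_K copies of 256B.
static_assert(ROWS_PER_LDS_X * BYTES_PER_ROW_X * WARPS_K_X == 8192, "ii stride");
// One step of jj moves to the next split-K copy within the same packed row.
static_assert(1 * BYTES_PER_ROW_WITH_PACKING_X == 256, "jj stride");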
+ int imm = + ii * ROWS_PER_LDS * BYTES_PER_ROW * Cta_tile::WARPS_K + jj * BYTES_PER_ROW_WITH_PACKING; + + // Load... + if (!HAS_INCOMPLETE_LDS || (ii < LDS_PER_LOOP - 1 || is_active_for_last_lds_)) { + fmha::lds(tmp[jj], smem_read_ + imm); + } + } + +// Perform the reduction. +#pragma unroll + for (int jj = 1; jj < Cta_tile::WARPS_K; ++jj) { + add_vec(tmp[0], tmp[jj]); + } + + // Write to out. + out[ii] = tmp[0]; + } + } + + // Store the accumulators. + template + inline __device__ void store(Accumulator const (&acc)[M][N], int mi) { + enum { M_PER_MMA = Mma_tile::M_PER_MMA_PER_CTA }; + + // The number of MMAs that are stored per loop iteration. + enum { MMAS_M_PER_LOOP = Mma_tile::MMAS_M / LOOPS }; + +#pragma unroll + for (int ni = 0; ni < Mma_tile::MMAS_N; ++ni) { +#pragma unroll + for (int mj = 0; mj < MMAS_M_PER_LOOP; ++mj) { + Sts_packed_type row_0, row_1; + Regs_to_rows::extract(acc[mi * MMAS_M_PER_LOOP + mj][ni], row_0, row_1); + + /* + (32bit acc) Each thread of a quad writes 16B per STS -> 64B per store. + Account for 2 -> 128B. + (16bit acc) Each thread of a quad writes 8B per STS -> 32B per store. + Account for 2 -> 64B. + */ + int imm_0 = (mj * M_PER_MMA + 0) * BYTES_PER_ROW * Cta_tile::WARPS_K + + (ni / 2) * BYTES_OFFSET_PER_MMA_N; + int imm_1 = (mj * M_PER_MMA + 8) * BYTES_PER_ROW * Cta_tile::WARPS_K + + (ni / 2) * BYTES_OFFSET_PER_MMA_N; + + // Store the elements. + fmha::sts(this->smem_write_ + imm_0, row_0); + fmha::sts(this->smem_write_ + imm_1, row_1); + } + // (32bit acc) Each thread of a quad writes 16B per STS -> 64B per store. + // (16bit acc) Each thread of a quad writes 8B per STS -> 32B per store. + if (Mma_tile::MMAS_N == 1) { + // Noop. + } else if (Mma_tile::MMAS_N % 2 == 0) { + this->smem_write_ ^= BYTES_PER_STS_PER_QUAD; + } else { + assert(false && "Unsupported"); + } + } + } + + // The write pointer. + uint32_t smem_write_; + // The write pointer. + uint32_t smem_read_; + // Is the thread active for the last LDS of the series? + int is_active_for_last_lds_; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Smem_tile_o + : public Smem_tile_o_base_8bit_mma { + // The traits class. + using Traits = fmha::Volta_imma_int8_int32_traits; + // The base class. + using Base = Smem_tile_o_base_8bit_mma; + + // Ctor. + inline __device__ Smem_tile_o(void* smem, int tidx) : Base(smem, tidx) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Smem_tile_o + : public Smem_tile_o_base_8bit_mma { + // The traits class. + using Traits = fmha::Turing_imma_int8_int32_traits; + // The base class. + using Base = Smem_tile_o_base_8bit_mma; + + // Ctor. + inline __device__ Smem_tile_o(void* smem, int tidx) : Base(smem, tidx) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Smem_tile_o + : public Smem_tile_o_base_8bit_mma { + // The traits class. + using Traits = fmha::Ampere_imma_int8_int32_traits; + // The base class. + using Base = Smem_tile_o_base_8bit_mma; + + // Ctor. + inline __device__ Smem_tile_o(void* smem, int tidx) : Base(smem, tidx) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Smem_tile_o + : public Smem_tile_o_base_8bit_mma { + // The traits class. + using Traits = fmha::Ada_qmma_e4m3_fp32_traits; + // The base class. 
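// --------------------------------------------------------------------------------------------
// Illustrative sketch (editor's aside, not part of the patch): the split-K reduction performed
// by load() above. Each of the WARPS_K warps has written its partial O tile into its own slice
// of the packed smem row; the reader fetches one 16B chunk per slice and sums them lane-wise.
// Host-side model for INT32 accumulators (add_vec would do the same per 32-bit lane on the GPU):
#include <array>
#include <cassert>

namespace split_k_sketch {

constexpr int kWarpsK = 2;  // assumed split-K factor
constexpr int kLanes = 4;   // a uint4 holds four 32-bit accumulators

using Chunk = std::array<int, kLanes>;

inline Chunk reduce_slices(std::array<Chunk, kWarpsK> const& slices) {
  Chunk out = slices[0];
  for (int k = 1; k < kWarpsK; ++k) {  // mirrors the `add_vec(tmp[0], tmp[jj])` loop
    for (int l = 0; l < kLanes; ++l) out[l] += slices[k][l];
  }
  return out;
}

inline void self_test() {
  std::array<Chunk, kWarpsK> const slices = {{{1, 2, 3, 4}, {10, 20, 30, 40}}};
  Chunk const sum = reduce_slices(slices);
  assert(sum[0] == 11 && sum[3] == 44);
}

}  // namespace split_k_sketch
// --------------------------------------------------------------------------------------------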
+ using Base = Smem_tile_o_base_8bit_mma; + + // Ctor. + inline __device__ Smem_tile_o(void* smem, int tidx) : Base(smem, tidx) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Smem_tile_o + : public Smem_tile_o_base_8bit_mma { + // The traits class. + using Traits = fmha::Ada_qmma_e4m3_fp16_traits; + // The base class. + using Base = Smem_tile_o_base_8bit_mma; + + // Ctor. + inline __device__ Smem_tile_o(void* smem, int tidx) : Base(smem, tidx) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Smem_tile_o_interleaved { + // The MMA tile. + using Mma_tile = typename Traits::template Mma_tile; + // The accumulators. + using Accumulator = fmha::Fragment_accumulator; + + enum { VEC = 32 }; + + enum { NUM_SLICES = Cta_tile::N / VEC }; + + static_assert(NUM_SLICES == 1 || NUM_SLICES == 2, ""); + + enum { BYTES_PER_ELEMENT = 4 }; + + enum { BYTES_PER_STS = 16 }; + + enum { BYTES_PER_LDS = 16 }; + + enum { ELTS_PER_STS = BYTES_PER_STS / BYTES_PER_ELEMENT }; + + static_assert(VEC * BYTES_PER_ELEMENT == 128, ""); + + enum { BYTES_PER_ROW = Cta_tile::WARPS_K * VEC * BYTES_PER_ELEMENT }; + + // Each row only stores one slice. The other slice starts this many rows below + enum { ROWS_PER_SLICE = Cta_tile::WARPS_M * 16 }; + + enum { TOTAL_ROWS = NUM_SLICES * ROWS_PER_SLICE }; + + enum { BYTES_PER_TILE = BYTES_PER_ROW * TOTAL_ROWS }; + + // LDS + enum { THREADS_PER_ROW = 8 }; + + enum { ROWS_PER_LDS = Cta_tile::THREADS_PER_CTA / THREADS_PER_ROW }; + + enum { LDS_PER_LOOP = TOTAL_ROWS / ROWS_PER_LDS }; + + // Ctor. + inline __device__ Smem_tile_o_interleaved(void* smem, int tidx) { + smem_ = __nvvm_get_smem_pointer(smem); + + constexpr int WARPS_M = Cta_tile::WARPS_M; + constexpr int WARPS_N = Cta_tile::WARPS_N; + constexpr int WARPS_K = Cta_tile::WARPS_K; + + // Warp order (fastest to slowest): m => n => k + // 2x2: 2,2,1 then 2,1,2: mask_m = 0x20, mask_k = 0x40, div_m = 32, div_k = 64 + // 1x4: 1,4,1 then 1,1,4: mask_m = 0x00, mask_k = 0x60, div_m = X, div_k = 32 + // 1x8: 1,8,1 then 1,1,8: mask_m = 0x00, mask_k = 0xe0, div_m = X, div_k = 32 + static_assert(WARPS_N == 1, ""); + + // A thread holds 4 elts of 4B. One slice of 32 elts has 128B. + // Two MMAs in N constitute one slice + + // the slice offset that depends on ni and has to be added later + static_assert(VEC / ELTS_PER_STS == 8, ""); // 8 columns of 4 elements + if (WARPS_M == 2 && WARPS_N == 1 && WARPS_K == 2) { + write_row = (tidx & 0x1c) / 4 + (tidx & 0x20) / 2; // warp_m * 16 rows + write_col = (tidx & 0x03) + (tidx & 0x40) / 8; // warp_k * VEC / ELTS_PER_STS + } else { + write_row = (tidx & 0x1c) / 4; + write_col = (tidx & 0x03) + (tidx & 0xe0) / 4; // warp_k * VEC / ELTS_PER_STS + } + write_col ^= (write_row & 0x01) * 4; // left or right 64B + + // this->smem_write_ = smem_ + write_row * BYTES_PER_ROW + write_col * BYTES_PER_STS; + + int read_row = tidx / THREADS_PER_ROW; + int read_col = tidx % THREADS_PER_ROW; + read_col ^= (read_row & 0x01) * 4; + this->smem_read_ = smem_ + read_row * BYTES_PER_ROW + read_col * BYTES_PER_LDS; + } + + // Store the accumulators. 
+ template + inline __device__ void store(Accumulator const (&acc)[M][N], int mi) { +#pragma unroll + for (int ni = 0; ni < Mma_tile::MMAS_N; ++ni) { + int const slice = ni / NUM_SLICES; + int col = write_col ^ ((ni & 1) * 4); + + uint32_t smem_write_ = smem_ + write_row * BYTES_PER_ROW + col * BYTES_PER_STS; + + // Extract the elements. + uint4 row_0, row_1; + + Regs_to_rows::extract(acc[mi][ni], row_0, row_1); + + // Each thread of a quad writes 16B per STS -> 64B per store. Account for + // 2 -> 128B. + int imm_0 = (slice * ROWS_PER_SLICE + 0) * BYTES_PER_ROW; + int imm_1 = (slice * ROWS_PER_SLICE + 8) * BYTES_PER_ROW; + + // Store the elements. + fmha::sts(smem_write_ + imm_0, row_0); + fmha::sts(smem_write_ + imm_1, row_1); + } + } + + // Load the output fragments. + inline __device__ void load(uint4 (&out)[LDS_PER_LOOP]) const { +#pragma unroll + for (int ii = 0; ii < LDS_PER_LOOP; ++ii) { + // Load the elements before the reduction (split-K). + uint4 tmp[Cta_tile::WARPS_K]; +#pragma unroll + for (int jj = 0; jj < Cta_tile::WARPS_K; ++jj) { + int imm = ii * ROWS_PER_LDS * BYTES_PER_ROW + jj * VEC * BYTES_PER_ELEMENT; + fmha::lds(tmp[jj], smem_read_ + imm); + } + +// Perform the reduction. +#pragma unroll + for (int jj = 1; jj < Cta_tile::WARPS_K; ++jj) { + add4(tmp[0], tmp[jj]); + } + + // Write to out. + out[ii] = tmp[0]; + } + } + + int write_row; + int write_col; + uint32_t smem_write_; + uint32_t smem_read_; + uint32_t smem_; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace fmha diff --git a/csrc/fmha_v2/fmha/smem_tile_qkv.h b/csrc/fmha_v2/fmha/smem_tile_qkv.h new file mode 100644 index 0000000000..32caaadb3a --- /dev/null +++ b/csrc/fmha_v2/fmha/smem_tile_qkv.h @@ -0,0 +1,592 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2011-2024 NVIDIA CORPORATION & AFFILIATES. All rights + * reserved. SPDX-License-Identifier: NVIDIA TensorRT Source Code License Agreement + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. + */ + +#pragma once + +#include + +namespace fmha { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Smem_tile_qkv_interleaved + : public fmha::Smem_tile_without_skews { + // The traits class. + using Traits = Traits_; + // The base class. + using Base = fmha::Smem_tile_without_skews; + + // The MMA tile. + using Mma_tile = typename Traits::template Mma_tile; + + // The fragment. + + // The size of a single LDS in bytes. 
+ enum { BYTES_PER_LDS = 16 }; + + enum { ROWS_PER_WARP = Cta_tile::THREADS_PER_WARP / Base::THREADS_PER_ROW }; + + using Fragment_a = fmha::Fragment_a; + using Fragment_b = fmha::Fragment_b; + + inline __device__ Smem_tile_qkv_interleaved(char* smem, int tidx) : Base(smem, tidx) {} + + uint32_t offset; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Smem_tile_qk_interleaved_a_base : public Smem_tile_qkv_interleaved { + using Base = Smem_tile_qkv_interleaved; + + static_assert(Base::THREADS_PER_ROW == 128 / 16, ""); + + enum { SMEM_ROWS_PER_WARP = Base::ROWS_PER_WARP }; + + static_assert(SMEM_ROWS_PER_WARP == 4, ""); + + using Mma_tile = typename Base::Mma_tile; + using Fragment = typename Base::Fragment_a; + + inline __device__ Smem_tile_qk_interleaved_a_base(char* smem, int tidx) : Base(smem, tidx) { + static_assert(Cta_tile::WARPS_K == 1, ""); + static_assert(Cta_tile::WARPS_M == 1 || Cta_tile::WARPS_M == 2, ""); + static_assert(Cta_tile::WARPS_N == 2 || Cta_tile::WARPS_N == 4, ""); + + constexpr int WARPS_M = Cta_tile::WARPS_M; + constexpr int WARPS_N = Cta_tile::WARPS_N; + constexpr int WARPS_K = Cta_tile::WARPS_K; + + constexpr int WARP_MASK_M = fmha::Warp_masks::M; + constexpr int WARP_DIV_M = 1 * 1 * Cta_tile::THREADS_PER_WARP; + + int const warp_m = (tidx & WARP_MASK_M) / WARP_DIV_M; + + /* Read address layout for ldsm: + * [ 0 16 1 17 2 18 3 19] + * [20 4 21 5 22 6 23 7] + * [ 8 24 9 25 10 26 11 27] + * [28 12 29 13 30 14 31 15] + */ + int read_row = (tidx & 0x04) / 4 + (tidx & 0x08) / 4 + warp_m * SMEM_ROWS_PER_WARP; + int read_col = (tidx & 0x03) * 2 + (tidx & 0x10) / 16; + read_col ^= (read_row & 0x01); + + this->offset = read_row * Base::BYTES_PER_ROW + read_col * Base::BYTES_PER_LDS; + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Smem_tile_qk_interleaved_a {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Smem_tile_qk_interleaved_a + : public Smem_tile_qk_interleaved_a_base { + using Traits = fmha::Volta_imma_int8_int32_traits; + using Base = Smem_tile_qk_interleaved_a_base; + using Mma_tile = typename Base::Mma_tile; + using Fragment = typename Base::Fragment; + + inline __device__ Smem_tile_qk_interleaved_a(char* smem, int tidx) : Base(smem, tidx) {} + + inline __device__ void load(Fragment (&frag)[Mma_tile::MMAS_M], int ki) { + int slice = ki / 2; + +#pragma unroll + for (int mi = 0; mi < Mma_tile::MMAS_M; mi++) { + // the data for the second slice sits below the first slice + uint32_t read_ptr = this->smem_ + this->offset + slice * Base::ROWS * Base::BYTES_PER_ROW / 2; + uint2 data; + ldsm_with_lds( + data, read_ptr + mi * Cta_tile::WARPS_M * Base::SMEM_ROWS_PER_WARP * Base::BYTES_PER_ROW); + static_assert(Fragment::NUM_REGS == 2, ""); + frag[mi].reg(0) = data.x; + frag[mi].reg(1) = data.y; + } + + this->offset ^= 16; + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Smem_tile_qk_interleaved_a + : public Smem_tile_qk_interleaved_a_base { + using Traits = fmha::Turing_imma_int8_int32_traits; + using Base = Smem_tile_qk_interleaved_a_base; + using Mma_tile = typename Base::Mma_tile; + using Fragment = typename Base::Fragment; + + inline __device__ Smem_tile_qk_interleaved_a(char* smem, int tidx) : Base(smem, tidx) {} + + inline __device__ void 
load(Fragment (&frag)[Mma_tile::MMAS_M], int ki) { + int slice = ki / 2; + +#pragma unroll + for (int mi = 0; mi < Mma_tile::MMAS_M; mi++) { + // the data for the second slice sits below the first slice + uint32_t read_ptr = this->smem_ + this->offset + slice * Base::ROWS * Base::BYTES_PER_ROW / 2; + uint2 data; + fmha::ldsm( + data, read_ptr + mi * Cta_tile::WARPS_M * Base::SMEM_ROWS_PER_WARP * Base::BYTES_PER_ROW); + static_assert(Fragment::NUM_REGS == 2, ""); + frag[mi].reg(0) = data.x; + frag[mi].reg(1) = data.y; + } + + this->offset ^= 16; + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Smem_tile_qk_interleaved_a + : public Smem_tile_qk_interleaved_a_base { + using Traits = fmha::Ampere_imma_int8_int32_traits; + using Base = Smem_tile_qk_interleaved_a_base; + using Mma_tile = typename Base::Mma_tile; + using Fragment = typename Base::Fragment; + + inline __device__ Smem_tile_qk_interleaved_a(char* smem, int tidx) : Base(smem, tidx) {} + + inline __device__ void load(Fragment (&frag)[Mma_tile::MMAS_M], int ki) { +#pragma unroll + for (int mi = 0; mi < Mma_tile::MMAS_M; mi++) { + // the data for the second slice sits below the first slice + uint32_t read_ptr = this->smem_ + this->offset + ki * Base::ROWS * Base::BYTES_PER_ROW / 2; + uint4 data; + fmha::ldsm( + data, read_ptr + mi * Cta_tile::WARPS_M * Base::SMEM_ROWS_PER_WARP * Base::BYTES_PER_ROW); + static_assert(Fragment ::NUM_REGS == 4, ""); + frag[mi].reg(0) = data.x; + frag[mi].reg(1) = data.y; + frag[mi].reg(2) = data.z; + frag[mi].reg(3) = data.w; + } + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Smem_tile_qk_interleaved_b_base : public Smem_tile_qkv_interleaved { + using Base = Smem_tile_qkv_interleaved; + + using Mma_tile = typename Base::Mma_tile; + using Fragment = typename Base::Fragment_b; + + inline __device__ Smem_tile_qk_interleaved_b_base(char* smem, int tidx) : Base(smem, tidx) { + constexpr int WARPS_M = Cta_tile::WARPS_M; + constexpr int WARPS_N = Cta_tile::WARPS_N; + constexpr int WARPS_K = Cta_tile::WARPS_K; + + // 2x2: 2,2,1 then 2,1,2 + // 1x4: 1,4,1 then 1,1,4 + static_assert(WARPS_K == 1, ""); + + constexpr int WARP_MASK_N = fmha::Warp_masks::N; + constexpr int WARP_DIV_N = WARPS_M * 1 * Cta_tile::THREADS_PER_WARP; + + // Only need to care about warp_n, because if warps_m > 1, both of them should load + // the same data + int const warp = (tidx & WARP_MASK_N) / WARP_DIV_N; + + /* transpose the order of the LDSMs: first along K, then along N + * [ 0 8 1 9 2 10 3 11] + * [12 4 13 5 14 6 15 7] + * [16 24 17 25 18 26 19 27] + * [28 20 29 21 30 22 31 23] + */ + int read_row = (tidx & 0x04) / 4 + (tidx & 0x10) / 8 + warp * Base::ROWS_PER_WARP; + int read_col = (tidx & 0x03) * 2 + (tidx & 0x08) / 8; + read_col ^= (read_row & 0x01); + + this->offset = read_row * Base::BYTES_PER_ROW + read_col * Base::BYTES_PER_LDS; + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Smem_tile_qk_interleaved_b {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Smem_tile_qk_interleaved_b + : public Smem_tile_qk_interleaved_b_base { + using Traits = fmha::Volta_imma_int8_int32_traits; + using Base = Smem_tile_qk_interleaved_b_base; + + using Mma_tile = typename Base::Mma_tile; + using Fragment = typename 
Base::Fragment; + + inline __device__ Smem_tile_qk_interleaved_b(char* smem, int tidx) : Base(smem, tidx) { + constexpr int WARPS_M = Cta_tile::WARPS_M; + constexpr int WARPS_N = Cta_tile::WARPS_N; + constexpr int WARPS_K = Cta_tile::WARPS_K; + constexpr int WARP_MASK_N = fmha::Warp_masks::N; + constexpr int WARP_DIV_N = WARPS_M * 1 * Cta_tile::THREADS_PER_WARP; + + // Only need to care about warp_n, because if warps_m > 1, both of them should load + // the same data + int const warp = (tidx & WARP_MASK_N) / WARP_DIV_N; + + int read_row = (tidx & 0x04) / 4 + (tidx & 0x08) / 4 + warp * Base::ROWS_PER_WARP; + int read_col = (tidx & 0x03) * 2 + (tidx & 0x10) / 16; + read_col ^= (read_row & 0x01); + + this->offset = read_row * Base::BYTES_PER_ROW + read_col * Base::BYTES_PER_LDS; + } + + inline __device__ void load(Fragment (&frag)[Mma_tile::MMAS_N], int ki) { + int slice = ki / 2; +#pragma unroll + for (int ni = 0; ni < Mma_tile::MMAS_N; ni++) { + uint32_t read_ptr = this->smem_ + this->offset + slice * Base::ROWS * Base::BYTES_PER_ROW / 2; + uint2 data; + ldsm_with_lds(data, + read_ptr + ni * Base::ROWS_PER_WARP * Cta_tile::WARPS_N * Base::BYTES_PER_ROW); + static_assert(Fragment ::NUM_REGS == 2, ""); + frag[ni].reg(0) = data.x; + frag[ni].reg(1) = data.y; + } + this->offset ^= 16; + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Smem_tile_qk_interleaved_b + : public Smem_tile_qk_interleaved_b_base { + using Traits = fmha::Turing_imma_int8_int32_traits; + using Base = Smem_tile_qk_interleaved_b_base; + + using Mma_tile = typename Base::Mma_tile; + using Fragment = typename Base::Fragment; + + inline __device__ Smem_tile_qk_interleaved_b(char* smem, int tidx) : Base(smem, tidx) { + constexpr int WARPS_M = Cta_tile::WARPS_M; + constexpr int WARPS_N = Cta_tile::WARPS_N; + constexpr int WARPS_K = Cta_tile::WARPS_K; + constexpr int WARP_MASK_N = fmha::Warp_masks::N; + constexpr int WARP_DIV_N = WARPS_M * 1 * Cta_tile::THREADS_PER_WARP; + + // Only need to care about warp_n, because if warps_m > 1, both of them should load + // the same data + int const warp = (tidx & WARP_MASK_N) / WARP_DIV_N; + + int read_row = (tidx & 0x04) / 4 + (tidx & 0x08) / 4 + warp * Base::ROWS_PER_WARP; + int read_col = (tidx & 0x03) * 2 + (tidx & 0x10) / 16; + read_col ^= (read_row & 0x01); + + this->offset = read_row * Base::BYTES_PER_ROW + read_col * Base::BYTES_PER_LDS; + } + + inline __device__ void load(Fragment (&frag)[Mma_tile::MMAS_N], int ki) { + int slice = ki / 2; +#pragma unroll + for (int ni = 0; ni < Mma_tile::MMAS_N; ni++) { + uint32_t read_ptr = this->smem_ + this->offset + slice * Base::ROWS * Base::BYTES_PER_ROW / 2; + uint2 data; + fmha::ldsm(data, + read_ptr + ni * Base::ROWS_PER_WARP * Cta_tile::WARPS_N * Base::BYTES_PER_ROW); + static_assert(Fragment ::NUM_REGS == 2, ""); + frag[ni].reg(0) = data.x; + frag[ni].reg(1) = data.y; + } + this->offset ^= 16; + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Smem_tile_qk_interleaved_b + : public Smem_tile_qk_interleaved_b_base { + using Traits = fmha::Ampere_imma_int8_int32_traits; + using Base = Smem_tile_qk_interleaved_b_base; + + using Mma_tile = typename Base::Mma_tile; + using Fragment = typename Base::Fragment; + + inline __device__ Smem_tile_qk_interleaved_b(char* smem, int tidx) : Base(smem, tidx) {} + + inline __device__ void load(Fragment (&frag)[Mma_tile::MMAS_N], int ki) { +#pragma 
unroll + for (int ni = 0; ni < Mma_tile::MMAS_N; ni++) { + uint32_t read_ptr = this->smem_ + this->offset + ki * Base::ROWS * Base::BYTES_PER_ROW / 2; + uint4 data; + fmha::ldsm(data, + read_ptr + ni * Base::ROWS_PER_WARP * Cta_tile::WARPS_N * Base::BYTES_PER_ROW); + static_assert(Fragment ::NUM_REGS == 4, ""); + frag[ni].reg(0) = data.x; + frag[ni].reg(1) = data.y; + frag[ni].reg(2) = data.z; + frag[ni].reg(3) = data.w; + } + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Smem_tile_v_interleaved_b_base + : public Smem_tile_qkv_interleaved { + using Base = Smem_tile_qkv_interleaved; + + using Mma_tile = typename Base::Mma_tile; + // TODO Row or col? + using Fragment = typename Base::Fragment_b; + + inline __device__ Smem_tile_v_interleaved_b_base(char* smem, int tidx) : Base(smem, tidx) { + // // DEBUG. + // static_assert( Cta_tile::N == 64, "" ); + // // END OF DEBUG. + + constexpr int WARPS_M = Cta_tile::WARPS_M; + constexpr int WARPS_N = Cta_tile::WARPS_N; + constexpr int WARPS_K = Cta_tile::WARPS_K; + + // 2x2: 2,2,1 then 2,1,2 + // 1x4: 1,4,1 then 1,1,4 + static_assert(WARPS_N == 1, ""); + + // Don't need to consider WARP M. For two warps in M, both would read the same tile + constexpr int WARP_MASK_K = fmha::Warp_masks::K; + constexpr int WARP_DIV_K = WARPS_M * WARPS_N * Cta_tile::THREADS_PER_WARP; + + // the static assert above ensures, that only warp_m or warp_k is non-zero + int const warp = (tidx & WARP_MASK_K) / WARP_DIV_K; + + /* LDSM.T addresses: warps are split in two to match BMM1-GEMM-N (= BMM2-GEMM-K) register + * layout + * <== GEMM-N = D = 64 ==> + * [ 0: 0 0 1 0 2 0 3 0] WARP 0 + * [ 1: 0 4 0 5 0 6 0 7] + * [ 2: 8 0 9 0 10 0 11 0] + * [ 3: 0 12 0 13 0 14 0 15] + * [ 4: 0 0 0 0 0 0 0 0] WARP 1 + * [ 5: 0 0 0 0 0 0 0 0] + * [ 6: 0 0 0 0 0 0 0 0] + * [ 7: 0 0 0 0 0 0 0 0] + * [ 8: 0 0 0 0 0 0 0 0] WARP 2 + * [ 9: 0 0 0 0 0 0 0 0] + * [10: 0 0 0 0 0 0 0 0] + * [11: 0 0 0 0 0 0 0 0] + * [12: 0 0 0 0 0 0 0 0] WARP 3 + * [13: 0 0 0 0 0 0 0 0] + * [14: 0 0 0 0 0 0 0 0] + * [15: 0 0 0 0 0 0 0 0] + * [16: 16 0 17 0 18 0 19 0] WARP 0 + * [17: 0 20 0 21 0 22 0 23] + * [18: 24 0 25 0 26 0 27 0] + * [19: 0 28 0 29 0 30 0 31] + * etc ... + */ + + // TODO this is a bit misleading, as 4 rows per warp applies to the + // row-major tiles above. In this smem tile, a warp actually owns 8 rows in + // SMEM, but we have 4 rows per slice + + // TODO would be good to rename to SMEM_ROWS_PER_WARP to make this clearer + static_assert(Base::ROWS_PER_WARP == 4, ""); + + read_row = ((tidx & 0x0f) / 4) + warp * Base::ROWS_PER_WARP; + read_col = (tidx & 0x03) * 2; + read_col ^= (read_row & 0x01); + + // this->offset = read_row * Base::BYTES_PER_ROW + read_col * Base::BYTES_PER_LDS; + } + + int read_row; + int read_col; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Smem_tile_v_interleaved_b {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Smem_tile_v_interleaved_b + : public Smem_tile_v_interleaved_b_base { + using Traits = fmha::Volta_imma_int8_int32_traits; + using Base = Smem_tile_v_interleaved_b_base; + using Mma_tile = typename Base::Mma_tile; + using Fragment = typename Base::Fragment_b; + + // Ctor. + inline __device__ Smem_tile_v_interleaved_b(char* smem, int tidx) : Base(smem, tidx) {} + + // Load fragments from shared memory. 
+ inline __device__ void load(Fragment (&frag)[Mma_tile::MMAS_N], int ki) { + // static_assert(Mma_tile::MMAS_K == 4, ""); + static_assert(Mma_tile::MMAS_N == 4, ""); + static_assert(Base::ROWS_PER_WARP == 4, ""); + // static_assert(Cta_tile::WARPS_K == 2, ""); + + int offset_k = ki * Cta_tile::WARPS_K * Base::ROWS_PER_WARP; +#pragma unroll + for (int ni = 0; ni < Mma_tile::MMAS_N; ni++) { + uint32_t offset = (this->read_row + offset_k) * Base::BYTES_PER_ROW + + (this->read_col ^ (ni & 1)) * Base::BYTES_PER_LDS; + + // for the next 32B in N, we have to jump down K rows, so K / 4 rows in + // smem, which stores 4 canonical 32B rows per 128B + offset += (ni / 2) * Cta_tile::K / 4 * Base::BYTES_PER_ROW; + uint32_t read_ptr = this->smem_ + offset; // + ki * Base::ROWS * Base::BYTES_PER_ROW / 2; + uint2 data = {0, 0}; + ldsmt_with_lds(data, read_ptr); + static_assert(Fragment ::NUM_REGS == 2, ""); + swizzle_rows(frag[ni].reg(0), frag[ni].reg(1), data.x, data.y); + } + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Smem_tile_v_interleaved_b + : public Smem_tile_v_interleaved_b_base { + using Traits = fmha::Turing_imma_int8_int32_traits; + using Base = Smem_tile_v_interleaved_b_base; + using Mma_tile = typename Base::Mma_tile; + using Fragment = typename Base::Fragment_b; + + // Ctor. + inline __device__ Smem_tile_v_interleaved_b(char* smem, int tidx) : Base(smem, tidx) {} + + // Load fragments from shared memory. + inline __device__ void load(Fragment (&frag)[Mma_tile::MMAS_N], int ki) { + static_assert(Mma_tile::MMAS_N == 4, ""); + static_assert(Base::ROWS_PER_WARP == 4, ""); + + int offset_k = ki * Cta_tile::WARPS_K * Base::ROWS_PER_WARP; +#pragma unroll + for (int ni = 0; ni < Mma_tile::MMAS_N; ni++) { + uint32_t offset = (this->read_row + offset_k) * Base::BYTES_PER_ROW + + (this->read_col ^ (ni & 1)) * Base::BYTES_PER_LDS; + // for the next 32B in N, we have to jump down K rows, so K / 4 rows in + // smem, which stores 4 canonical 32B rows per 128B + offset += (ni / 2) * Cta_tile::K / 4 * Base::BYTES_PER_ROW; + uint32_t read_ptr = this->smem_ + offset; // + ki * Base::ROWS * Base::BYTES_PER_ROW / 2; + uint2 data = {0, 0}; + fmha::ldsmt(data, read_ptr); + static_assert(Fragment ::NUM_REGS == 2, ""); + swizzle_rows(frag[ni].reg(0), frag[ni].reg(1), data.x, data.y); + } + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Smem_tile_v_interleaved_b + : public Smem_tile_v_interleaved_b_base { + // The instruction traits. + using Traits = fmha::Ampere_imma_int8_int32_traits; + // The base class. + using Base = Smem_tile_v_interleaved_b_base; + // The tile of MMAs. + using Mma_tile = typename Base::Mma_tile; + // The fragment loaded. + using Fragment = typename Base::Fragment_b; + + // Ctor. + inline __device__ Smem_tile_v_interleaved_b(char* smem, int tidx) : Base(smem, tidx) {} + + // Load from shared memory. + inline __device__ void load(Fragment (&frag)[Mma_tile::MMAS_N], int ki) { + int offset_k = ki * Cta_tile::WARPS_K * Base::ROWS_PER_WARP * 2; + static_assert(Cta_tile::K != 192 || Mma_tile::MMAS_K == 2, ""); +#pragma unroll + for (int ni = 0; ni < Mma_tile::MMAS_N; ni++) { + uint32_t offset = (this->read_row + offset_k) * Base::BYTES_PER_ROW + + (this->read_col ^ (ni & 1)) * Base::BYTES_PER_LDS; + + // For the next 32B in N, we have to jump down K rows, so K / 4 rows in smem, which + // stores 4 canonical 32B rows per 128B. 
+ offset += (ni / 2) * Cta_tile::K / 4 * Base::BYTES_PER_ROW; + uint32_t read_ptr = this->smem_ + offset; // + ki * Base::ROWS * Base::BYTES_PER_ROW / 2; + uint2 data0 = {0, 0}; + uint2 data1 = {0, 0}; + fmha::ldsmt(data0, read_ptr); + + if (Cta_tile::K != 192 || ki == 0) { + static_assert(Cta_tile::K != 192 || Mma_tile::MMAS_K == 2); + // For 192, with 4 warps, we need 128 rows of K, so for the second ldsm, we need + // only 2x instead of 4x. + int imm = Cta_tile::WARPS_K * Base::ROWS_PER_WARP * Base::BYTES_PER_ROW; + fmha::ldsmt(data1, read_ptr + imm); + } + + static_assert(Fragment ::NUM_REGS == 4, ""); + swizzle_rows(frag[ni].reg(0), frag[ni].reg(2), data0.x, data0.y); + swizzle_rows(frag[ni].reg(1), frag[ni].reg(3), data1.x, data1.y); + } + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace fmha diff --git a/csrc/fmha_v2/fmha/smem_tile_v.h b/csrc/fmha_v2/fmha/smem_tile_v.h new file mode 100644 index 0000000000..67a02f37ca --- /dev/null +++ b/csrc/fmha_v2/fmha/smem_tile_v.h @@ -0,0 +1,1008 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2011-2024 NVIDIA CORPORATION & AFFILIATES. All rights + * reserved. SPDX-License-Identifier: NVIDIA TensorRT Source Code License Agreement + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. + */ + +#pragma once + +#include +#include +#include +#include +#include + +namespace fmha { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Smem_tile_v {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template class Rows_per_xor_pattern, + int BUFFERS_PER_TILE = 1> +struct Smem_tile_v_hmma { + using Base = Smem_tile_without_skews::VALUE, 1>; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Smem_tile_v + : public Smem_tile_v_hmma::Base { + // The traits class. + using Traits = fmha::Volta_hmma_fp16_16x16x16_traits; + // The base class. + using Base = typename Smem_tile_v_hmma::Base; + // The MMA tile. + using Mma_tile = typename Traits::template Mma_tile; + // The fragment. + using Fragment = fmha::Fragment_b; + + // The size of a single LDS in bytes. + enum { BYTES_PER_LDS = 16 }; + + // Ctor. + inline __device__ Smem_tile_v(void* smem, int tidx) : Base(smem, tidx) { + // Warps. + enum { WARPS_M = Cta_tile::WARPS_M }; + + enum { WARPS_N = Cta_tile::WARPS_N }; + + enum { WARPS_K = Cta_tile::WARPS_K }; + + // Determine the config. + enum { WARPS_2x1x2 = WARPS_M == 2 && WARPS_N == 1 && WARPS_K == 2 }; + + enum { WARPS_1x1x8 = WARPS_M == 1 && WARPS_N == 1 && WARPS_K == 8 }; + + enum { WARPS_1x1x4 = WARPS_M == 1 && WARPS_N == 1 && WARPS_K == 4 }; + + // Flash Attention uses WARPS_4x1x1 + enum { WARPS_4x1x1 = WARPS_M == 4 && WARPS_N == 1 && WARPS_K == 1 }; + + // The row/col read by the thread. + int read_row, read_col; + + // SEQLEN == 128 and N == 16. + if (WARPS_2x1x2 && Cta_tile::N == 16) { + read_row = (tidx & 0x40) / 16 + (tidx & 0x08) / 8; + read_col = (tidx & 0x10) / 16 + (tidx & 0x03) * 2; + + // SEQLEN == 128 and N == 32. 
+ } else if (WARPS_2x1x2 && Cta_tile::N == 32) { + read_row = (tidx & 0x40) / 8 + (tidx & 0x08) / 4 + (tidx & 0x02) / 2; + read_col = (tidx & 0x10) / 16 + (tidx & 0x01) * 4; + + // SEQLEN == 128 and N == 64. + } else if (WARPS_2x1x2 && Cta_tile::N == 64) { + read_row = (tidx & 0x40) / 4 + (tidx & 0x08) / 2 + (tidx & 0x03); + read_col = (tidx & 0x10) / 16; + + // SEQLEN == 256, 512 and N == 16. + } else if ((WARPS_1x1x8 || WARPS_1x1x4) && Cta_tile::N == 16) { + read_row = (tidx & 0xe0) / 8 + (tidx & 0x08) / 8; + read_col = (tidx & 0x10) / 16 + (tidx & 0x03) * 2; + + // SEQLEN == 256, 512 and N == 32. + } else if ((WARPS_1x1x8 || WARPS_1x1x4) && Cta_tile::N == 32) { + read_row = (tidx & 0xe0) / 4 + (tidx & 0x08) / 4 + (tidx & 0x02) / 2; + read_col = (tidx & 0x10) / 16 + (tidx & 0x01) * 4; + + // SEQLEN == 256, 384 and 512 and N == 64. + } else if ((WARPS_1x1x8 || WARPS_1x1x4) && + (Cta_tile::N == 64 || Cta_tile::N == 128 || Cta_tile::N == 256)) { + read_row = (tidx & 0xe0) / 2 + (tidx & 0x08) / 2 + (tidx & 0x03); + read_col = (tidx & 0x10) / 16; + + // ANY SEQLEN and N == 16. + } else if (WARPS_4x1x1 && Cta_tile::N == 16) { + read_row = (tidx & 0x08) / 8; + read_col = (tidx & 0x10) / 16 + (tidx & 0x03) * 2; + + // ANY SEQLEN and N == 32. + } else if (WARPS_4x1x1 && Cta_tile::N == 32) { + read_row = (tidx & 0x08) / 4 + (tidx & 0x02) / 2; + read_col = (tidx & 0x10) / 16 + (tidx & 0x01) * 4; + + // ANY SEQLEN and N == 64/128/256. + } else if (WARPS_4x1x1 && (Cta_tile::N == 64 || Cta_tile::N == 128 || Cta_tile::N == 256)) { + read_row = (tidx & 0x08) / 2 + (tidx & 0x03); + read_col = (tidx & 0x10) / 16; + + // Not supported! + } else { + assert(false); + } + + // Apply the XOR for the column. + read_col ^= read_row % Base::ROWS_PER_XOR_PATTERN; + + // The shared memory offset. + this->smem_read_offset_ = read_row * Base::BYTES_PER_ROW + read_col * BYTES_PER_LDS; + } + + // Load from shared memory. + inline __device__ void load(Fragment (&b)[Mma_tile::VALID_MMAS_N], int ki) { +#pragma unroll + for (int ni = 0; ni < Mma_tile::MMAS_N; ++ni) { + // The column offset. + int offset = this->smem_read_offset_ ^ (ni * 2 * BYTES_PER_LDS); + + // Skip N paddings + if (ni < Mma_tile::VALID_MMAS_N) { + // The rows. + int row_0 = ki * 16 * Cta_tile::WARPS_K + 0; + int row_1 = ki * 16 * Cta_tile::WARPS_K + 8; + + // Load the data using 2x LDS.128. + uint4 tmp; + fmha::lds(tmp, this->smem_ + offset + row_0 * Base::BYTES_PER_ROW_BEFORE_PACKING); + b[ni].reg(0) = tmp.x; + b[ni].reg(1) = tmp.y; + b[ni].reg(2) = tmp.z; + b[ni].reg(3) = tmp.w; + + fmha::lds(tmp, this->smem_ + offset + row_1 * Base::BYTES_PER_ROW_BEFORE_PACKING); + b[ni].reg(4) = tmp.x; + b[ni].reg(5) = tmp.y; + b[ni].reg(6) = tmp.z; + b[ni].reg(7) = tmp.w; + } + } + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Smem_tile_v_turing_hmma + : public Smem_tile_v_hmma::Base { + // The base class. + using Base = typename Smem_tile_v_hmma::Base; + // The MMA tile. + using Mma_tile = typename Traits::template Mma_tile; + // The fragment. + using Fragment = fmha::Fragment_b; + + // The size of a single LDS in bytes. + enum { BYTES_PER_LDS = 16 }; + + // Ctor. + inline __device__ Smem_tile_v_turing_hmma(void* smem, int tidx) : Base(smem, tidx) { + // Warps. + enum { WARPS_M = Cta_tile::WARPS_M }; + + enum { WARPS_N = Cta_tile::WARPS_N }; + + enum { WARPS_K = Cta_tile::WARPS_K }; + + // Determine the config. 
+ enum { WARPS_2x1x2 = WARPS_M == 2 && WARPS_N == 1 && WARPS_K == 2 }; + + enum { WARPS_1x1x8 = WARPS_M == 1 && WARPS_N == 1 && WARPS_K == 8 }; + + enum { WARPS_1x1x4 = WARPS_M == 1 && WARPS_N == 1 && WARPS_K == 4 }; + + // Flash Attention uses WARPS_4x1x1 + enum { WARPS_4x1x1 = WARPS_M == 4 && WARPS_N == 1 && WARPS_K == 1 }; + + // The row/col read by the thread. + int read_row, read_col; + + // SEQLEN == 128 and N == 16. + if (WARPS_2x1x2 && Cta_tile::N == 16) { + read_row = (tidx & 0x40) / 16 + (tidx & 0x04) / 4; + read_col = (tidx & 0x03) * 2 + (tidx & 0x04) / 4; + + // SEQLEN == 128 and N == 32. + } else if (WARPS_2x1x2 && Cta_tile::N == 32) { + read_row = (tidx & 0x40) / 8 + (tidx & 0x06) / 2; + read_col = (tidx & 0x01) * 4 + (tidx & 0x06) / 2; + + // SEQLEN == 128 and N == 64. + } else if (WARPS_2x1x2 && Cta_tile::N == 64) { + read_row = (tidx & 0x40) / 4 + (tidx & 0x07); + read_col = (tidx & 0x07); + + // SEQLEN == 256, 512 and N == 16. + } else if ((WARPS_1x1x8 || WARPS_1x1x4) && Cta_tile::N == 16) { + read_row = (tidx & 0xe0) / 8 + (tidx & 0x04) / 4; + read_col = (tidx & 0x03) * 2 + (tidx & 0x04) / 4; + + // SEQLEN == 256, 512 and N == 32. + } else if ((WARPS_1x1x8 || WARPS_1x1x4) && Cta_tile::N == 32) { + read_row = (tidx & 0xe0) / 4 + (tidx & 0x06) / 2; + read_col = (tidx & 0x01) * 4 + (tidx & 0x06) / 2; + + // SEQLEN == 256, 384, 512 and N == 64, 128, 256. + } else if ((WARPS_1x1x8 || WARPS_1x1x4) && + (Cta_tile::N == 64 || Cta_tile::N == 128 || Cta_tile::N == 256)) { + read_row = (tidx & 0xe0) / 2 + (tidx & 0x07); + read_col = (tidx & 0x07); + + // ANY SEQLEN and N == 16. + } else if (WARPS_4x1x1 && Cta_tile::N == 16) { + read_row = (tidx & 0x04) / 4; + read_col = (tidx & 0x03) * 2 + (tidx & 0x04) / 4; + + // ANY SEQLEN and N == 32. + } else if (WARPS_4x1x1 && Cta_tile::N == 32) { + read_row = (tidx & 0x06) / 2; + read_col = (tidx & 0x01) * 4 + (tidx & 0x06) / 2; + + // ANY SEQLEN and N == 64/128/256. + } else if ((WARPS_4x1x1) && (Cta_tile::N == 64 || Cta_tile::N == 128 || Cta_tile::N == 256)) { + read_row = (tidx & 0x07); + read_col = (tidx & 0x07); + + // Not supported! + } else { + assert(false); + } + + // The 2nd HMMA. + read_col ^= (tidx & 0x08) / 8; + + // The shared memory offset. + this->smem_read_offset_ = read_row * Base::BYTES_PER_ROW + read_col * BYTES_PER_LDS; + } + + // Load from shared memory. + inline __device__ void load(Fragment (&b)[Mma_tile::VALID_MMAS_N], int ki) { +#pragma unroll + for (int ni = 0; ni < Mma_tile::MMAS_N; ++ni) { + // The amount of row packing. + enum { ROW_PACKING = Base::BYTES_PER_ROW / Base::BYTES_PER_ROW_BEFORE_PACKING }; + + // Skip N paddings + if (ni < Mma_tile::VALID_MMAS_N) { + // For even values of k value we jump by 16*WARPS_K rows and for odd, we jump by 8 rows. + int row = (ki / 2) * 16 * Cta_tile::WARPS_K / ROW_PACKING + (ki % 2) * 8 / ROW_PACKING; + + // Load the data using LDSM.MT88.2. + uint2 tmp; + fmha::ldsmt(tmp, this->smem_ + this->smem_read_offset_ + row * Base::BYTES_PER_ROW); + b[ni].reg(0) = tmp.x; + b[ni].reg(1) = tmp.y; + } + + // Move to the next N position. + if (Mma_tile::MMAS_N == 1) { + ; + } else if (Mma_tile::MMAS_N == 2) { + this->smem_read_offset_ ^= BYTES_PER_LDS * 2; + } else if (Mma_tile::MMAS_N == 4) { + this->smem_read_offset_ ^= BYTES_PER_LDS * (ni % 2 == 0 ? 2 : 6); + } else if (Mma_tile::MMAS_N == 8) { + this->smem_read_offset_ ^= BYTES_PER_LDS * ((ni & 1) == 0 ? 2 : ((ni & 3) == 3 ? 14 : 6)); + } else if (Mma_tile::MMAS_N == 16) { + this->smem_read_offset_ ^= BYTES_PER_LDS * ((ni & 1) == 0 ? 
2 + : ((ni & 7) == 7) ? 30 + : (((ni & 3) == 3) ? 14 : 6)); + } else { + assert(false); + } + } + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Smem_tile_v + : public Smem_tile_v_turing_hmma { + // The base class. + using Base = Smem_tile_v_turing_hmma; + + // Ctor. + inline __device__ Smem_tile_v(void* smem, int tidx) : Base(smem, tidx) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Smem_tile_v + : public Smem_tile_v_turing_hmma { + // The base class. + using Base = Smem_tile_v_turing_hmma; + + // Ctor. + inline __device__ Smem_tile_v(void* smem, int tidx) : Base(smem, tidx) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template class Rows_per_xor_pattern, + int BUFFERS_PER_TILE = 1> +struct Smem_tile_v_imma { + using Base = Smem_tile_without_skews::VALUE, 1>; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Smem_tile_v + : public Smem_tile_v_imma::Base { + // The traits class. + using Traits = Volta_imma_int8_int32_traits; + // The base class. + using Base = typename Smem_tile_v_imma::Base; + + // DEBUG. + static_assert(Base::BYTES_PER_ROW / Base::BYTES_PER_ROW_BEFORE_PACKING == 2, ""); + // END OF DEBUG. + + // The MMA tile. + using Mma_tile = typename Traits::template Mma_tile; + // The fragment. + using Fragment = fmha::Fragment_b; + + // The size of a single LDS in bytes. + enum { BYTES_PER_LDS = 16 }; + + // Ctor. + inline __device__ Smem_tile_v(void* smem, int tidx) : Base(smem, tidx) { + // The row/col read by the thread. + int read_row, read_col; + + // Warps. + enum { WARPS_M = Cta_tile::WARPS_M }; + + enum { WARPS_N = Cta_tile::WARPS_N }; + + enum { WARPS_K = Cta_tile::WARPS_K }; + + // Determine the config. + enum { WARPS_2x1x2 = WARPS_M == 2 && WARPS_N == 1 && WARPS_K == 2 }; + + enum { WARPS_1x1x8 = WARPS_M == 1 && WARPS_N == 1 && WARPS_K == 8 }; + + enum { WARPS_1x1x4 = WARPS_M == 1 && WARPS_N == 1 && WARPS_K == 4 }; + + // SEQLEN == 128 and N == 16. + if (WARPS_2x1x2 && Cta_tile::N == 16) { + read_row = (tidx & 0x40) / 32 + (tidx & 0x08) / 8; + read_col = (tidx & 0x07); + + // SEQLEN == 128 and N == 32. + } else if (WARPS_2x1x2 && Cta_tile::N == 32) { + read_row = (tidx & 0x40) / 16 + (tidx & 0x0c) / 4; + read_col = (tidx & 0x03) * 2 + (tidx & 0x04) / 4; + + // SEQLEN == 128 and N == 64. + } else if (WARPS_2x1x2 && Cta_tile::N == 64) { + read_row = (tidx & 0x40) / 8 + (tidx & 0x0e) / 2; + read_col = (tidx & 0x01) * 4 + (tidx & 0x06) / 2; + + // SEQLEN == 256, 512 and N == 16. + } else if ((WARPS_1x1x8 || WARPS_1x1x4) && Cta_tile::N == 16) { + read_row = (tidx & 0xe0) / 16 + (tidx & 0x08) / 8; + read_col = (tidx & 0x07); + + // SEQLEN == 256, 512 and N == 32. + } else if ((WARPS_1x1x8 || WARPS_1x1x4) && Cta_tile::N == 32) { + read_row = (tidx & 0xe0) / 8 + (tidx & 0x0c) / 4; + read_col = (tidx & 0x03) * 2 + (tidx & 0x04) / 4; + + // SEQLEN == 256, 384, 512 and N == 64. + } else if ((WARPS_1x1x8 || WARPS_1x1x4) && Cta_tile::N == 64) { + read_row = (tidx & 0xe0) / 4 + (tidx & 0x0e) / 2; + read_col = (tidx & 0x01) * 4 + (tidx & 0x06) / 2; + + // Not supported. + } else { + assert(false); + } + + // The shared memory offset. + this->smem_read_offset_ = read_row * Base::BYTES_PER_ROW + read_col * BYTES_PER_LDS; + } + + // Load from shared memory. 
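// --------------------------------------------------------------------------------------------
// Illustrative sketch (editor's aside, not part of the patch): the XOR ladder used by the
// HMMA/IMMA V tiles in this file to advance smem_read_offset_ across MMAS_N. Starting from a
// thread's base offset, the per-iteration XOR visits every 32B-aligned column slot exactly once
// and lands back on the start, so no extra register is needed to track the column.
// Host-side check for MMAS_N = 8 (BYTES_PER_LDS = 16):
#include <cassert>
#include <set>

namespace xor_ladder_sketch {

inline void check_mmas_n_8() {
  constexpr int kBytesPerLds = 16;
  int offset = 0;  // relative to the thread's swizzled base offset
  std::set<int> visited;
  for (int ni = 0; ni < 8; ++ni) {
    visited.insert(offset);
    // Same toggle pattern as the MMAS_N == 8 branch of the Turing HMMA tile above.
    offset ^= kBytesPerLds * ((ni & 1) == 0 ? 2 : ((ni & 3) == 3 ? 14 : 6));
  }
  assert(visited.size() == 8);  // eight distinct 32B-spaced slots: 0, 32, ..., 224
  assert(offset == 0);          // and the ladder wraps back to the starting offset
}

}  // namespace xor_ladder_sketch
// --------------------------------------------------------------------------------------------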
+ inline __device__ void load(Fragment (&b)[Mma_tile::VALID_MMAS_N], int ki) { + static_assert(Mma_tile::MMAS_K == 2 || Mma_tile::MMAS_K == 3 || Mma_tile::MMAS_K == 4 || + Mma_tile::MMAS_K == 6, + ""); +#pragma unroll + for (int ni = 0; ni < Mma_tile::MMAS_N; ++ni) { + // The amount of row packing. + enum { ROW_PACKING = Base::BYTES_PER_ROW / Base::BYTES_PER_ROW_BEFORE_PACKING }; + + // Skip N paddings + if (ni < Mma_tile::VALID_MMAS_N) { + // Jump by 8*16 rows per K but account for packing. + int row = ki * 16 * Cta_tile::WARPS_K / ROW_PACKING; + + // We emulate the Turing logic, which loads the data using LDSM.MT88.2: + // uint2 tmp; + // fmha::ldsmt(tmp, this->smem_ + this->smem_read_offset_ + row * Base::BYTES_PER_ROW); + // this call fetches two 8x16 matrices, stacked on top of each other + + // we fake LDSM.MT88.2, with 2 LDS.128 and a shuffle: + // - T 0 - T 7 have the smem addresses of LDSM 0, each should do 16B loads + // - T 8 - T15 have the smem addresses of LSDM 1, each should do 16B loads + int const lane = threadIdx.x % Cta_tile::THREADS_PER_WARP; + + uint4 tmp16{0, 0, 0, 0}; // 16B + + if (lane < 16) { + fmha::lds(tmp16, this->smem_ + this->smem_read_offset_ + row * Base::BYTES_PER_ROW); + } + + uint16_t* tmp16c = reinterpret_cast(&tmp16); // 8x2B: we move pairs + + uint2 tmp; // 2*4B + uint16_t* t = reinterpret_cast(&tmp); // 4x2B + + int const src_col = lane / 4; // 0 - 7 + int const src_row = lane % 4 * 2; + +// We have to shuffle the values to distribute them in the warp. +#pragma unroll + for (int it = 0; it < 8; it++) { + uint16_t val, x, y; + val = tmp16c[it]; + x = __shfl_sync(uint32_t(-1), val, src_row + 0); + __syncwarp(); + y = __shfl_sync(uint32_t(-1), val, src_row + 1); + __syncwarp(); + + if (src_col == it) { + t[0] = x; + t[1] = y; + } + val = tmp16c[it]; + x = __shfl_sync(uint32_t(-1), val, src_row + 8); + __syncwarp(); + y = __shfl_sync(uint32_t(-1), val, src_row + 9); + __syncwarp(); + + if (src_col == it) { + t[2] = x; + t[3] = y; + } + } + + // Repack the elements. With LDSM.T, thread 0 has the following elements in its two + // regs: + // + // R0 = [(n=0 k=0), (n=1 k=0), (n=0 k=8), (n=1 k=8)] + // R1 = [(n=0 k=1), (n=1 k=1), (n=0 k=9), (n=1 k=9)] + // + // We want to repack the values as: + // + // R0 = [(n=0 k=0), (n=0 k=1), (n=0 k=8), (n=0 k=9)] + // R1 = [(n=1 k=0), (n=1 k=1), (n=1 k=8), (n=1 k=9)] + // + // Since that this layout corresponds to the layout of elements in the Fragment_a from + // P. + + swizzle_rows(b[ni].reg(0), b[ni].reg(1), tmp.x, tmp.y); + } + + // Move to the next N position. + if (Mma_tile::MMAS_N == 4) { + this->smem_read_offset_ ^= BYTES_PER_LDS * (ni % 2 == 0 ? 1 : 3); + } else { + assert(false); + } + } + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Smem_tile_v + : public Smem_tile_v_imma::Base { + // The traits class. + using Traits = Turing_imma_int8_int32_traits; + // The base class. + using Base = typename Smem_tile_v_imma::Base; + // The MMA tile. + using Mma_tile = typename Traits::template Mma_tile; + // The fragment. + using Fragment = fmha::Fragment_b; + + // The size of a single LDS in bytes. + enum { BYTES_PER_LDS = 16 }; + + // Ctor. + inline __device__ Smem_tile_v(void* smem, int tidx) : Base(smem, tidx) { + // Warps. + enum { WARPS_M = Cta_tile::WARPS_M }; + + enum { WARPS_N = Cta_tile::WARPS_N }; + + enum { WARPS_K = Cta_tile::WARPS_K }; + + // Determine the config. 
+ enum { WARPS_2x1x2 = WARPS_M == 2 && WARPS_N == 1 && WARPS_K == 2 }; + + enum { WARPS_1x1x8 = WARPS_M == 1 && WARPS_N == 1 && WARPS_K == 8 }; + + enum { WARPS_1x1x4 = WARPS_M == 1 && WARPS_N == 1 && WARPS_K == 4 }; + + // The row/col read by the thread. + int read_row, read_col; + + // SEQLEN == 128 and N == 32. + if (WARPS_2x1x2 && Cta_tile::N == 16) { + read_row = (tidx & 0x40) / 32 + (tidx & 0x08) / 8; + read_col = (tidx & 0x07); + + // SEQLEN == 128 and N == 32. + } else if (WARPS_2x1x2 && Cta_tile::N == 32) { + read_row = (tidx & 0x40) / 16 + (tidx & 0x0c) / 4; + read_col = (tidx & 0x03) * 2 + (tidx & 0x04) / 4; + + // SEQLEN == 128 and N == 64. + } else if (WARPS_2x1x2 && Cta_tile::N == 64) { + read_row = (tidx & 0x40) / 8 + (tidx & 0x0e) / 2; + read_col = (tidx & 0x01) * 4 + (tidx & 0x06) / 2; + + // SEQLEN == 256, 512 and N == 16. + } else if ((WARPS_1x1x8 || WARPS_1x1x4) && Cta_tile::N == 16) { + read_row = (tidx & 0xe0) / 16 + (tidx & 0x08) / 8; + read_col = (tidx & 0x07); + + // SEQLEN == 256, 512 and N == 32. + } else if ((WARPS_1x1x8 || WARPS_1x1x4) && Cta_tile::N == 32) { + read_row = (tidx & 0xe0) / 8 + (tidx & 0x0c) / 4; + read_col = (tidx & 0x03) * 2 + (tidx & 0x04) / 4; + + // SEQLEN == 256, 384, 512 and N == 64. + } else if ((WARPS_1x1x8 || WARPS_1x1x4) && Cta_tile::N == 64) { + read_row = (tidx & 0xe0) / 4 + (tidx & 0x0e) / 2; + read_col = (tidx & 0x01) * 4 + (tidx & 0x06) / 2; + + // Not supported. + } else { + assert(false); + } + + // The shared memory offset. + this->smem_read_offset_ = read_row * Base::BYTES_PER_ROW + read_col * BYTES_PER_LDS; + } + + // Load from shared memory. + inline __device__ void load(Fragment (&b)[Mma_tile::VALID_MMAS_N], int ki) { + static_assert(Mma_tile::MMAS_K == 2 || Mma_tile::MMAS_K == 3 || Mma_tile::MMAS_K == 4 || + Mma_tile::MMAS_K == 6 || Mma_tile::MMAS_K == 8, + ""); +#pragma unroll + for (int ni = 0; ni < Mma_tile::MMAS_N; ++ni) { + // The amount of row packing. + enum { ROW_PACKING = Base::BYTES_PER_ROW / Base::BYTES_PER_ROW_BEFORE_PACKING }; + + // Skip N paddings + if (ni < Mma_tile::VALID_MMAS_N) { + // Jump by 8*16 rows per K but account for packing. + int row = ki * 16 * Cta_tile::WARPS_K / ROW_PACKING; + + // Load the data using LDSM.MT88.2. + uint2 tmp; + fmha::ldsmt(tmp, this->smem_ + this->smem_read_offset_ + row * Base::BYTES_PER_ROW); + + // Repack the elements. With LDSM.T, thread 0 has the following elements in its two + // regs: + // + // R0 = [(n=0 k=0), (n=1 k=0), (n=0 k=8), (n=1 k=8)] + // R1 = [(n=0 k=1), (n=1 k=1), (n=0 k=9), (n=1 k=9)] + // + // We want to repack the values as: + // + // R0 = [(n=0 k=0), (n=0 k=1), (n=0 k=8), (n=0 k=9)] + // R1 = [(n=1 k=0), (n=1 k=1), (n=1 k=8), (n=1 k=9)] + // + // Since that this layout corresponds to the layout of elements in the Fragment_a from + // P. + + swizzle_rows(b[ni].reg(0), b[ni].reg(1), tmp.x, tmp.y); + + // b[ni].reg(0) = tmp.x; + // b[ni].reg(1)= tmp.y; + } + + // Move to the next N position. + if (Mma_tile::MMAS_N == 1) { + // Noop. + } else if (Mma_tile::MMAS_N == 2) { + this->smem_read_offset_ ^= BYTES_PER_LDS; + } else if (Mma_tile::MMAS_N == 4) { + this->smem_read_offset_ ^= BYTES_PER_LDS * (ni % 2 == 0 ? 1 : 3); + } else { + assert(false); + } + } + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Smem_tile_v_ampere_hmma + : public Smem_tile_v_hmma::Base { + // The base class. + using Base = typename Smem_tile_v_hmma::Base; + // The MMA tile. 
+ using Mma_tile = typename Traits::template Mma_tile; + // The fragment. + using Fragment = fmha::Fragment_b; + + // The size of a single LDS in bytes. + enum { BYTES_PER_LDS = 16 }; + + // Ctor. + inline __device__ Smem_tile_v_ampere_hmma(void* smem, int tidx) : Base(smem, tidx) { + // Warps. + enum { WARPS_M = Cta_tile::WARPS_M }; + + enum { WARPS_N = Cta_tile::WARPS_N }; + + enum { WARPS_K = Cta_tile::WARPS_K }; + + // Determine the config. + enum { WARPS_2x1x2 = WARPS_M == 2 && WARPS_N == 1 && WARPS_K == 2 }; + + enum { WARPS_1x1x8 = WARPS_M == 1 && WARPS_N == 1 && WARPS_K == 8 }; + + enum { WARPS_1x1x4 = WARPS_M == 1 && WARPS_N == 1 && WARPS_K == 4 }; + + // Flash Attention uses WARPS_4x1x1 + enum { WARPS_4x1x1 = WARPS_M == 4 && WARPS_N == 1 && WARPS_K == 1 }; + + // The row/col read by the thread. + int read_row, read_col; + + // SEQLEN == 128 and N == 16. + if (WARPS_2x1x2 && Cta_tile::N == 16) { + read_row = (tidx & 0x40) / 16 + (tidx & 0x0c) / 4; + read_col = (tidx & 0x03) * 2 + (tidx & 0x04) / 4; + + // SEQLEN == 128 and N == 32. + } else if (WARPS_2x1x2 && Cta_tile::N == 32) { + read_row = (tidx & 0x40) / 8 + (tidx & 0x0e) / 2; + read_col = (tidx & 0x01) * 4 + (tidx & 0x06) / 2; + + // SEQLEN == 128 and N == 64/128/256. + } else if (WARPS_2x1x2 && (Cta_tile::N == 64 || Cta_tile::N == 128 || Cta_tile::N == 256)) { + read_row = (tidx & 0x40) / 4 + (tidx & 0x0f); + read_col = (tidx & 0x07); + + // SEQLEN == 256, 512 and N == 16. + } else if ((WARPS_1x1x8 || WARPS_1x1x4) && Cta_tile::N == 16) { + read_row = (tidx & 0xe0) / 8 + (tidx & 0x0c) / 4; + read_col = (tidx & 0x03) * 2 + (tidx & 0x04) / 4; + + // SEQLEN == 256, 512 and N == 32. + } else if ((WARPS_1x1x8 || WARPS_1x1x4) && Cta_tile::N == 32) { + read_row = (tidx & 0xe0) / 4 + (tidx & 0x0e) / 2; + read_col = (tidx & 0x01) * 4 + (tidx & 0x06) / 2; + + // SEQLEN == 256, 384, 512 and N == 64/128/256. + } else if ((WARPS_1x1x8 || WARPS_1x1x4) && + (Cta_tile::N == 64 || Cta_tile::N == 128 || Cta_tile::N == 256)) { + read_row = (tidx & 0xe0) / 2 + (tidx & 0x0f); + read_col = (tidx & 0x07); + + // ANY SEQLEN and N == 16. + } else if (WARPS_4x1x1 && Cta_tile::N == 16) { + read_row = (tidx & 0x0c) / 4; + read_col = (tidx & 0x03) * 2 + (tidx & 0x04) / 4; + + // ANY SEQLEN and N == 32. + } else if (WARPS_4x1x1 && Cta_tile::N == 32) { + read_row = (tidx & 0x0e) / 2; + read_col = (tidx & 0x01) * 4 + (tidx & 0x06) / 2; + + // ANY SEQLEN and N == 64/128/256. + } else if (WARPS_4x1x1 && (Cta_tile::N == 64 || Cta_tile::N == 128 || Cta_tile::N == 256 || + Cta_tile::N == 512)) { + read_row = (tidx & 0x0f); + read_col = (tidx & 0x07); + + // Not supported. + } else { + assert(false); + } + + // The 2nd HMMA. + read_col ^= (tidx & 0x10) / 16; + + // The shared memory offset. + this->smem_read_offset_ = read_row * Base::BYTES_PER_ROW + read_col * BYTES_PER_LDS; + } + + // Load from shared memory. + inline __device__ void load(Fragment (&b)[Mma_tile::VALID_MMAS_N], int ki) { +#pragma unroll + for (int ni = 0; ni < Mma_tile::MMAS_N; ++ni) { + // The amount of row packing. + enum { ROW_PACKING = Base::BYTES_PER_ROW / Base::BYTES_PER_ROW_BEFORE_PACKING }; + + // Jump by 16 * #warps row. Account for the packing. + int row = ki * 16 * Cta_tile::WARPS_K / ROW_PACKING; + + // Skip N paddings + if (ni < Mma_tile::VALID_MMAS_N) { + // Jump by 16 * #warps row. Account for the packing. + int row = ki * 16 * Cta_tile::WARPS_K / ROW_PACKING; + + // Load the data using LDSM.MT88.2. 
+ uint4 tmp; + fmha::ldsmt(tmp, this->smem_ + this->smem_read_offset_ + this->smem_read_buffer_ + + row * Base::BYTES_PER_ROW); + b[ni].reg(0) = tmp.x; + b[ni].reg(1) = tmp.y; + b[ni].reg(2) = tmp.z; + b[ni].reg(3) = tmp.w; + } + + // Move the pointer for the next ni. I expect the compiler to not recompute those. + static_assert(Mma_tile::MMAS_N <= 64, ""); + if (Mma_tile::MMAS_N >= 32 && ni % 16 == 15) { + this->smem_read_offset_ ^= 31 * BYTES_PER_LDS * 2; + } else if (Mma_tile::MMAS_N >= 16 && ni % 8 == 7) { + this->smem_read_offset_ ^= 15 * BYTES_PER_LDS * 2; + } else if (Mma_tile::MMAS_N >= 8 && ni % 4 == 3) { + this->smem_read_offset_ ^= 7 * BYTES_PER_LDS * 2; + } else if (Mma_tile::MMAS_N >= 4 && ni % 2 == 1) { + this->smem_read_offset_ ^= 3 * BYTES_PER_LDS * 2; + } else if (Mma_tile::MMAS_N >= 2) { + this->smem_read_offset_ ^= 1 * BYTES_PER_LDS * 2; + } + } + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Smem_tile_v + : public Smem_tile_v_ampere_hmma { + // The base class. + using Base = Smem_tile_v_ampere_hmma; + + // Ctor. + inline __device__ Smem_tile_v(void* smem, int tidx) : Base(smem, tidx) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Smem_tile_v + : public Smem_tile_v_ampere_hmma { + // The base class. + using Base = Smem_tile_v_ampere_hmma; + + // Ctor. + inline __device__ Smem_tile_v(void* smem, int tidx) : Base(smem, tidx) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Smem_tile_v + : public Smem_tile_v_ampere_hmma { + // The base class. + using Base = Smem_tile_v_ampere_hmma; + + // Ctor. + inline __device__ Smem_tile_v(void* smem, int tidx) : Base(smem, tidx) {} +}; + +template +struct Smem_tile_v_ampere_8bit_mma + : public Smem_tile_v_imma::Base { + // The base class. + using Base = typename Smem_tile_v_imma::Base; + // The MMA tile. + using Mma_tile = typename Traits::template Mma_tile; + // The fragment. + using Fragment = fmha::Fragment_b; + + // The size of a single LDS in bytes. + enum { BYTES_PER_LDS = 16 }; + + // Ctor. + inline __device__ Smem_tile_v_ampere_8bit_mma(void* smem, int tidx) : Base(smem, tidx) { + // Warps. + enum { WARPS_M = Cta_tile::WARPS_M }; + + enum { WARPS_N = Cta_tile::WARPS_N }; + + enum { WARPS_K = Cta_tile::WARPS_K }; + + // Determine the config. + enum { WARPS_2x1x2 = WARPS_M == 2 && WARPS_N == 1 && WARPS_K == 2 }; + + enum { WARPS_1x1x8 = WARPS_M == 1 && WARPS_N == 1 && WARPS_K == 8 }; + + enum { WARPS_1x1x4 = WARPS_M == 1 && WARPS_N == 1 && WARPS_K == 4 }; + + enum { WARPS_4x1x1 = WARPS_M == 4 && WARPS_N == 1 && WARPS_K == 1 }; + + // The row/col read by the thread. + int read_row, read_col; + + // SEQLEN == 128 and N == 16. + if (WARPS_2x1x2 && Cta_tile::N == 16) { + read_row = (tidx & 0x40) / 32 + (tidx & 0x08) / 8; + read_col = (tidx & 0x07); + + // SEQLEN == 128 and N == 32. + } else if (WARPS_2x1x2 && Cta_tile::N == 32) { + read_row = (tidx & 0x40) / 16 + (tidx & 0x0c) / 4; + read_col = (tidx & 0x03) * 2 + (tidx & 0x04) / 4; + + // SEQLEN == 128 and N == 64. + } else if (WARPS_2x1x2 && Cta_tile::N == 64) { + read_row = (tidx & 0x40) / 8 + (tidx & 0x0e) / 2; + read_col = (tidx & 0x01) * 4 + (tidx & 0x06) / 2; + + // SEQLEN == 256, 512 and N == 16. 
+ } else if ((WARPS_1x1x8 || WARPS_1x1x4) && Cta_tile::N == 16) { + read_row = (tidx & 0xe0) / 16 + (tidx & 0x08) / 8; + read_col = (tidx & 0x07); + + // SEQLEN == 256, 512 and N == 32. + } else if ((WARPS_1x1x8 || WARPS_1x1x4) && Cta_tile::N == 32) { + read_row = (tidx & 0xe0) / 8 + (tidx & 0x0c) / 4; + read_col = (tidx & 0x03) * 2 + (tidx & 0x04) / 4; + + // SEQLEN == 256, 384, 512 and N == 64. + } else if ((WARPS_1x1x8 || WARPS_1x1x4) && Cta_tile::N == 64) { + read_row = (tidx & 0xe0) / 4 + (tidx & 0x0e) / 2; + read_col = (tidx & 0x01) * 4 + (tidx & 0x06) / 2; + } else if (WARPS_4x1x1 && Cta_tile::N == 32) { + read_row = (tidx % 32) / 4; + read_col = read_row % 2 + (tidx % 4) * 2; + } else if (WARPS_4x1x1 && Cta_tile::N == 64) { + read_row = (tidx % 32) / 2; + read_col = read_row % 4 + (tidx & 0x01) * 4; + } else if (WARPS_4x1x1 && (Cta_tile::N >= 128)) { + read_row = tidx % 32; + read_col = tidx % 8; + + // Not supported. + } else { + assert(false); + } + + // The shared memory offset. + this->smem_read_offset_ = read_row * Base::BYTES_PER_ROW + read_col * BYTES_PER_LDS; + } + + // Load from shared memory. + inline __device__ void load(Fragment (&b)[Mma_tile::VALID_MMAS_N], int ki) { +// static_assert(Mma_tile::MMAS_K == 3 || Mma_tile::MMAS_K == 2 || Mma_tile::MMAS_K == 1, ""); +#pragma unroll + for (int ni = 0; ni < Mma_tile::MMAS_N; ++ni) { + // The amount of row packing. + enum { ROW_PACKING = Base::BYTES_PER_ROW / Base::BYTES_PER_ROW_BEFORE_PACKING }; + + // // Make sure we do not end up with weird values :) + // static_assert(Cta_tile::WARPS_K % ROW_PACKING == 0, ""); + + // Skip N paddings + if (ni < Mma_tile::VALID_MMAS_N) { + // Jump by 8*32 rows per K but account for the fact that we have packing. + int row_0 = (ki * 32 + 0 * 16) * Cta_tile::WARPS_K / ROW_PACKING; + int row_1 = (ki * 32 + 1 * 16) * Cta_tile::WARPS_K / ROW_PACKING; + + // Load the data using LDSM.MT88.2. + uint32_t smem = this->smem_ + this->smem_read_offset_ + this->smem_read_buffer_; + uint2 tmp_0; + fmha::ldsmt(tmp_0, smem + row_0 * Base::BYTES_PER_ROW); + + // Load the next two values. + uint2 tmp_1 = make_uint2(0u, 0u); + if constexpr (Cta_tile::K > 16) { + fmha::ldsmt(tmp_1, smem + row_1 * Base::BYTES_PER_ROW); + } + + // Repack the elements. With LDSM.T, thread 0 has the following elements in its 4 regs: + // + // R0 = [(n=0 k= 0), (n=1 k= 0), (n=0 k= 1), (n=1 k= 1)] + // R1 = [(n=0 k= 8), (n=1 k= 8), (n=0 k= 9), (n=1 k= 9)] + // R2 = [(n=0 k=128), (n=1 k=128), (n=0 k=129), (n=1 k=129)] + // R3 = [(n=0 k=136), (n=1 k=136), (n=0 k=137), (n=1 k=137)] + // + // We want to repack the values as: + // + // R0 = [(n=0 k= 0), (n=0 k= 1), (n=0 k= 8), (n=0 k= 9)] + // R1 = [(n=0 k=128), (n=0 k=129), (n=0 k=136), (n=0 k=137)] + // R2 = [(n=1 k= 0), (n=1 k= 1), (n=1 k= 8), (n=1 k= 9)] + // R3 = [(n=1 k=128), (n=1 k=129), (n=1 k=136), (n=1 k=137)] + // + // Since this layout corresponds to the layout of elements in the Fragment_a from P. + + swizzle_rows(b[ni].reg(0), b[ni].reg(2), tmp_0.x, tmp_0.y); + swizzle_rows(b[ni].reg(1), b[ni].reg(3), tmp_1.x, tmp_1.y); + } + + // Move to the next N position. 
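// --------------------------------------------------------------------------------------------
// Illustrative sketch (editor's aside, not part of the patch): the byte movement described in
// the repacking comment above. LDSM.T leaves the two n-values interleaved inside each 32-bit
// register; swizzle_rows separates them so each register holds one n with its k-values, the
// layout Fragment_a of P expects. On the GPU this is presumably a byte permute (PRMT /
// __byte_perm with selectors 0x6420 and 0x7531 -- an assumption; the actual swizzle_rows helper
// is defined elsewhere in the patch). The host code below just spells out the gather.
#include <cassert>
#include <cstdint>

namespace swizzle_rows_sketch {

inline uint8_t byte(uint32_t v, int i) { return static_cast<uint8_t>(v >> (8 * i)); }

inline uint32_t pack(uint8_t b0, uint8_t b1, uint8_t b2, uint8_t b3) {
  return uint32_t(b0) | (uint32_t(b1) << 8) | (uint32_t(b2) << 16) | (uint32_t(b3) << 24);
}

// x = [n0k0, n1k0, n0k1, n1k1], y = [n0k8, n1k8, n0k9, n1k9]  (byte 0 = lowest address)
inline void swizzle_rows_host(uint32_t& lo, uint32_t& hi, uint32_t x, uint32_t y) {
  lo = pack(byte(x, 0), byte(x, 2), byte(y, 0), byte(y, 2));  // [n0k0, n0k1, n0k8, n0k9]
  hi = pack(byte(x, 1), byte(x, 3), byte(y, 1), byte(y, 3));  // [n1k0, n1k1, n1k8, n1k9]
}

inline void self_test() {
  // Encode the (n, k) pairs as distinct byte values to make the gather visible.
  uint32_t const x = pack(0x00, 0x10, 0x01, 0x11);  // n0k0, n1k0, n0k1, n1k1
  uint32_t const y = pack(0x08, 0x18, 0x09, 0x19);  // n0k8, n1k8, n0k9, n1k9
  uint32_t lo = 0, hi = 0;
  swizzle_rows_host(lo, hi, x, y);
  assert(lo == pack(0x00, 0x01, 0x08, 0x09));  // all n = 0
  assert(hi == pack(0x10, 0x11, 0x18, 0x19));  // all n = 1
}

}  // namespace swizzle_rows_sketch
// --------------------------------------------------------------------------------------------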
+ if (Mma_tile::MMAS_N >= 32 && ni % 16 == 15) { + this->smem_read_offset_ ^= 31 * BYTES_PER_LDS; + } else if (Mma_tile::MMAS_N >= 16 && ni % 8 == 7) { + this->smem_read_offset_ ^= 15 * BYTES_PER_LDS; + } else if (Mma_tile::MMAS_N >= 8 && ni % 4 == 3) { + this->smem_read_offset_ ^= 7 * BYTES_PER_LDS; + } else if (Mma_tile::MMAS_N >= 4 && ni % 2 == 1) { + this->smem_read_offset_ ^= 3 * BYTES_PER_LDS; + } else if (Mma_tile::MMAS_N >= 2) { + this->smem_read_offset_ ^= 1 * BYTES_PER_LDS; + } else { + assert(false); + } + } + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Smem_tile_v + : public Smem_tile_v_ampere_8bit_mma { + // The base class. + using Base = + Smem_tile_v_ampere_8bit_mma; + + // Ctor. + inline __device__ Smem_tile_v(void* smem, int tidx) : Base(smem, tidx) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Smem_tile_v + : public Smem_tile_v_ampere_8bit_mma { + // The base class. + using Base = Smem_tile_v_ampere_8bit_mma; + + // Ctor. + inline __device__ Smem_tile_v(void* smem, int tidx) : Base(smem, tidx) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Smem_tile_v + : public Smem_tile_v_ampere_8bit_mma { + // The base class. + using Base = Smem_tile_v_ampere_8bit_mma; + + // Ctor. + inline __device__ Smem_tile_v(void* smem, int tidx) : Base(smem, tidx) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace fmha diff --git a/csrc/fmha_v2/fmha/softmax.h b/csrc/fmha_v2/fmha/softmax.h new file mode 100644 index 0000000000..68ecea49b9 --- /dev/null +++ b/csrc/fmha_v2/fmha/softmax.h @@ -0,0 +1,3964 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2011-2024 NVIDIA CORPORATION & AFFILIATES. All rights + * reserved. SPDX-License-Identifier: NVIDIA TensorRT Source Code License Agreement + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. + */ + +#pragma once + +#include + +#include "fmha/fragment.h" +#include "fmha/utils.h" + +namespace fmha { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct Sum_ { + enum { IS_SUM = 1 }; + + static inline __device__ float apply(float x, float y) { return x + y; } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct Max_ { + enum { IS_SUM = 0 }; + + static inline __device__ float apply(float x, float y) { return fmaxf(x, y); } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ float apply_exp_(float x, float max) { + return isinf(x) ? 
0.f : __expf(x - max); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +inline __device__ float apply_exp_<2>(float x, float max) { + return __expf(x - max); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ float get_alibi_head_scaling_factor(int const in_head_id, + AlibiParams const& params) { + int const head_id = params.head_idx_offset + in_head_id; + if (head_id < params.h_pow_2) { + // 2^(head_id * -8 / h) + return exp2f((head_id + 1) * 2 * params.alibi_neg4_div_h) * params.scale_after_alibi; + } else { + // 1,3,5... etc + float const adjusted_head_id = 2 * (head_id - params.h_pow_2) + 1; + // 2^(adjusted_head_id * -4 / h) + return exp2f(adjusted_head_id * params.alibi_neg4_div_h) * params.scale_after_alibi; + ; + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct ReadType { + using T = float; +}; + +template <> +struct ReadType<4> { + using T = float; +}; + +template <> +struct ReadType<8> { + using T = float2; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Smem_tile_reduce { + // Helper class to distribute MMA tiles reduced over rows per warp over quads. + + // The Mma tile. + using Mma_tile = typename Traits::template Mma_tile; + + // The number of MMAs in M/N dimensions. + enum { MMAS_M = Mma_tile::MMAS_M }; + + enum { MMAS_N = Mma_tile::MMAS_N }; + + enum { WARPS_M = Cta_tile::WARPS_M }; + + enum { WARPS_N = Cta_tile::WARPS_N }; + + static constexpr int ROWS = WARPS_M * MMAS_M * 16; + static constexpr int COLS = WARPS_N; + static constexpr int ROWS_PER_XOR_PATTERN = (COLS == 8) ? 4 : 8; + static constexpr int BYTES_PER_TILE = ROWS * COLS * sizeof(float); + static constexpr int ELTS_PER_TILE = ROWS * COLS; + + static constexpr int THREADS_PER_GROUP = Kernel_traits::Gmem_tile_o::THREADS_PER_ROW; + static constexpr int ROWS_PER_WARP = 32 / THREADS_PER_GROUP; + static constexpr int LOOPS = Kernel_traits::Gmem_tile_o::LOOPS; + + using read_t = typename ReadType::T; + + __device__ inline Smem_tile_reduce(float* smem_, int const tidx) { + int lane = tidx % 32; + int warp = tidx / 32; + + int warp_m = warp % WARPS_M; + int warp_n = warp / WARPS_M; + + qid_ = lane % 4; + int qp = lane / 4; + + // Swizzle the column to avoid 2-fold bank conflicts when we have 8 warps. + // This won't affect reading as we assume commutative reduction ops. 
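// Host-side sketch of the ALiBi slope schedule that get_alibi_head_scaling_factor
// above implements. It assumes AlibiParams encodes h_pow_2 as the largest power
// of two <= num_heads and alibi_neg4_div_h as -4.0f / h_pow_2; scale_after_alibi
// and head_idx_offset are left out of the sketch.
#include <cmath>
#include <cstdio>
#include <vector>

static std::vector<float> alibi_slopes_sketch(int num_heads) {
  int h_pow_2 = 1;
  while (h_pow_2 * 2 <= num_heads) h_pow_2 *= 2;
  float const neg4_div_h = -4.0f / static_cast<float>(h_pow_2);
  std::vector<float> slopes(num_heads);
  for (int h = 0; h < num_heads; ++h) {
    if (h < h_pow_2) {
      slopes[h] = std::exp2f((h + 1) * 2 * neg4_div_h);  // 2^(-8*(h+1)/h_pow_2)
    } else {
      float const adjusted = 2.0f * (h - h_pow_2) + 1.0f;
      slopes[h] = std::exp2f(adjusted * neg4_div_h);     // 2^(-4*(2*(h-h_pow_2)+1)/h_pow_2)
    }
  }
  return slopes;
}

int main() {
  for (float s : alibi_slopes_sketch(12)) printf("%g ", s);
  printf("\n");
  return 0;
}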
+ int const col = warp_n ^ (qp / ROWS_PER_XOR_PATTERN); + smem_write_ = &smem_[warp_m * 16 * MMAS_M * WARPS_N + qp * WARPS_N + col]; + smem_read_ = &reinterpret_cast(smem_)[warp_m * 16 * MMAS_M * 4 + qp * 4 + qid_]; + } + + __device__ inline void store(float (&frag)[2 * MMAS_M]) { + if (qid_ == 0) { +#pragma unroll + for (int mi = 0; mi < MMAS_M; mi++) { + int offset = mi * 16 * WARPS_N; + smem_write_[offset + 0 * 8 * WARPS_N] = frag[mi * 2 + 0]; + smem_write_[offset + 1 * 8 * WARPS_N] = frag[mi * 2 + 1]; + } + } + } + + __device__ inline void load(read_t (&frag)[2 * MMAS_M]) { +#pragma unroll + for (int mi = 0; mi < MMAS_M; mi++) { + int offset = mi * 16 * 4; + frag[mi * 2 + 0] = smem_read_[offset + 0 * 8 * 4]; + frag[mi * 2 + 1] = smem_read_[offset + 1 * 8 * 4]; + } + } + + int qid_; + float* smem_write_; + read_t* smem_read_; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Softmax_base { + // The Mma tile. + using Mma_tile = typename Traits::template Mma_tile; + + // The number of MMAs in M/N dimensions. + enum { MMAS_M = Mma_tile::MMAS_M }; + + enum { MMAS_N = Mma_tile::MMAS_N }; + + // The number of groups of warp such that we have at most 4 warps writing consecutive elements. + enum { GROUPS = fmha::Div_up::VALUE }; + + // The number of elements that we are going to store per row. + enum { ELEMENTS_PER_ROW = Cta_tile::WARPS_N / GROUPS }; + + // The number of rows. + enum { ROWS = Cta_tile::M * GROUPS }; + + // The total number of elements. + enum { ELEMENTS = ROWS * ELEMENTS_PER_ROW }; + + // If shared memory is used + enum { USE_SHARED_MEMORY = Cta_tile::WARPS_N > 1 }; + + // DEBUG. + static_assert(ELEMENTS == Cta_tile::M * Cta_tile::WARPS_N, ""); + + // END OF DEBUG. + + // The number of rows per thread. + enum { ROWS_PER_THREAD = MMAS_M * 2 }; + + // Ctor. + template + inline __device__ Softmax_base(Params const& params, void* smem, int bidb, int tidx) + : smem_(reinterpret_cast(smem)), tidx_(tidx) { + // Extract the position in the warp. + int warp = tidx / Cta_tile::THREADS_PER_WARP; + int lane = tidx % Cta_tile::THREADS_PER_WARP; + + // Decompose the warp index into M and N. + int warp_m = warp % Cta_tile::WARPS_M; + int warp_n = warp / Cta_tile::WARPS_M; + + // Decompose the warp-n index into group/position-inside-the-group. + int warp_g = warp_n / ELEMENTS_PER_ROW; + int warp_i = warp_n % ELEMENTS_PER_ROW; + + // The location written by the threads. + int write_row = warp_g * Cta_tile::M + warp_m * Mma_tile::M_PER_MMA + lane / 4; + int write_col = warp_i; + + // Assemble the write pointer. + smem_write_ = &smem_[write_row * ELEMENTS_PER_ROW + write_col]; + + // Assemble the read pointer. + smem_read_ = &smem_[warp_m * Mma_tile::M_PER_MMA + lane / 4]; + } + + // Apply mask before softmax. Use 1 byte per MMA distributed as 2x4. 
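// Sketch of the accumulator-element -> (row, col) mapping that the masking code
// below indexes as elt_[2 * mi + ii][4 * ni + jj]. This is the standard
// Turing/Ampere fp16 m16n8 accumulator layout for one 16x16 tile; the warp- and
// tile-level offsets (warp_m, warp_n, mi, ni) are added on top of it elsewhere.
static inline __device__ void hmma_elt_to_row_col_sketch(int lane, int ii, int jj, int& row,
                                                         int& col) {
  // ii in [0, 2): which 8-row half of the 16-row tile this element sits in.
  // jj in [0, 4): which of the four columns owned by this thread.
  row = (lane / 4) + 8 * ii;
  col = 2 * (lane % 4) + (jj & 1) + 8 * (jj / 2);  // columns {0, 1, 8, 9} per quad position
}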
+ template + inline __device__ void apply_mask(Mask const& mask) { +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { +#pragma unroll + for (int ii = 0; ii < 2; ++ii) { +#pragma unroll + for (int ni = 0; ni < MMAS_N; ++ni) { +#pragma unroll + for (int jj = 0; jj < 4; ++jj) { + if (!mask.is_valid(mi, ni, ii, jj)) { + elt_[2 * mi + ii][4 * ni + jj] = -FLT_MAX; + } + } + } + } + } + } + + template + inline __device__ void apply_mask_alibi(Mask const& mask, int head_id, + AlibiParams const& alibi_params) { + // 'if constexpr' because ALiBi is only defined for causal masks + if constexpr (Kernel_traits::CAUSAL_MASK) { + float m = get_alibi_head_scaling_factor(head_id, alibi_params); +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { +#pragma unroll + for (int ii = 0; ii < 2; ++ii) { +#pragma unroll + for (int ni = 0; ni < MMAS_N; ++ni) { +#pragma unroll + for (int jj = 0; jj < 4; ++jj) { + int row, col; + mask.get_row_col(row, col, mi, ni, ii, jj); + if (mask.is_valid(row, col)) { + // Since softmax is shift invariant, + // it is sufficient just to use the column as the multiplier + elt_[2 * mi + ii][4 * ni + jj] = + elt_[2 * mi + ii][4 * ni + jj] * alibi_params.scale_after_alibi + + m * (col + alibi_params.sequence_pos_offset); + } else { + elt_[2 * mi + ii][4 * ni + jj] = -FLT_MAX; + } + } + } + } + } + } else { + __builtin_unreachable(); + } + } + + // Apply the mask to unpacked data. + inline __device__ void apply_mask(uint32_t const (&packed_mask)[MMAS_M]) { + // This code works only if we have MMAS_N <= 4. + static_assert(MMAS_N <= 4, ""); + + // Expand the mask. + int mask[MMAS_M * 2][MMAS_N * 4]; +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { +#pragma unroll + for (int ni = 0; ni < MMAS_N; ++ni) { + mask[2 * mi + 0][4 * ni + 0] = packed_mask[mi] & (1u << (8 * ni + 0)); + mask[2 * mi + 0][4 * ni + 1] = packed_mask[mi] & (1u << (8 * ni + 1)); + mask[2 * mi + 1][4 * ni + 0] = packed_mask[mi] & (1u << (8 * ni + 2)); + mask[2 * mi + 1][4 * ni + 1] = packed_mask[mi] & (1u << (8 * ni + 3)); + mask[2 * mi + 0][4 * ni + 2] = packed_mask[mi] & (1u << (8 * ni + 4)); + mask[2 * mi + 0][4 * ni + 3] = packed_mask[mi] & (1u << (8 * ni + 5)); + mask[2 * mi + 1][4 * ni + 2] = packed_mask[mi] & (1u << (8 * ni + 6)); + mask[2 * mi + 1][4 * ni + 3] = packed_mask[mi] & (1u << (8 * ni + 7)); + } + } + +// Apply the mask. +#pragma unroll + for (int mi = 0; mi < MMAS_M * 2; ++mi) { +#pragma unroll + for (int ni = 0; ni < MMAS_N * 4; ++ni) { + if (!mask[mi][ni]) { + elt_[mi][ni] = -FLT_MAX; + } + } + } + } + + // Mask the elements that are outside the the sequence length. + inline __device__ void apply_mask(int const actual_seqlen) { + // The warp/lane decomposition. + int const warp = threadIdx.x / Cta_tile::THREADS_PER_WARP; + int const lane = threadIdx.x % Cta_tile::THREADS_PER_WARP; + + // The warp in the n dimension. + int const warp_n = warp / Cta_tile::WARPS_M; + // The position within a quad. + int const quad_lane = lane % 4; + +#pragma unroll + for (int mi = 0; mi < MMAS_M * 2; ++mi) { +#pragma unroll + for (int ni = 0; ni < MMAS_N; ++ni) { + // Determine the position in the sequence. 
+ int const offset = ni * Mma_tile::N_PER_MMA_PER_CTA + warp_n * 16; + if (offset + 0 + 2 * quad_lane >= actual_seqlen) { + elt_[mi][4 * ni + 0] = -FLT_MAX; // 0 + } + if (offset + 1 + 2 * quad_lane >= actual_seqlen) { + elt_[mi][4 * ni + 1] = -FLT_MAX; // 1 + } + if (offset + 8 + 2 * quad_lane >= actual_seqlen) { + elt_[mi][4 * ni + 2] = -FLT_MAX; // 8 + } + if (offset + 9 + 2 * quad_lane >= actual_seqlen) { + elt_[mi][4 * ni + 3] = -FLT_MAX; // 9 + } + } + } + } + + // Apply the exp to all the elements. + inline __device__ void apply_exp(float const max) { +#pragma unroll + for (int mi = 0; mi < MMAS_M * 2; ++mi) { +#pragma unroll + for (int ni = 0; ni < MMAS_N * 4; ++ni) { + elt_[mi][ni] = apply_exp_(elt_[mi][ni], max); + } + } + } + + // Apply the exp to all the elements. + inline __device__ void apply_scale_exp(float const (&max)[MMAS_M * 2], float scale_bmm1) { +#pragma unroll + for (int mi = 0; mi < MMAS_M * 2; ++mi) { +#pragma unroll + for (int ni = 0; ni < MMAS_N * 4; ++ni) { + elt_[mi][ni] = apply_exp_(scale_bmm1 * elt_[mi][ni], max[mi]); + } + } + } + + // Apply the exp to all the elements. + inline __device__ void apply_exp(float const (&max)[MMAS_M * 2]) { +#pragma unroll + for (int mi = 0; mi < MMAS_M * 2; ++mi) { +#pragma unroll + for (int ni = 0; ni < MMAS_N * 4; ++ni) { + elt_[mi][ni] = apply_exp_(elt_[mi][ni], max[mi]); + } + } + } + + // Do a warp-wide reduction. + template + inline __device__ void reduce_Nx1(float (&dst)[MMAS_M * 2]) { +#if defined(USE_SAME_SUM_ORDER_IN_SOFTMAX_AS_REF_CODE) + if (Functor::IS_SUM) { +// Apply the summation inside the thread. +#pragma unroll + for (int mi = 0; mi < MMAS_M * 2; ++mi) { + float tmp[2] = {0.f, 0.f}; +#pragma unroll + for (int ni = 0; ni < MMAS_N; ++ni) { + tmp[0] += elt_[mi][4 * ni + 0] + elt_[mi][4 * ni + 1]; + tmp[1] += elt_[mi][4 * ni + 2] + elt_[mi][4 * ni + 3]; + } + dst[mi] = tmp[0] + tmp[1]; + } + } else +#endif // defined(USE_SAME_SUM_ORDER_IN_SOFTMAX_AS_REF_CODE) + { +// Apply the functor for each row inside a thread. +#pragma unroll + for (int mi = 0; mi < MMAS_M * 2; ++mi) { + dst[mi] = elt_[mi][0]; +#pragma unroll + for (int ni = 1; ni < MMAS_N * 4; ++ni) { + dst[mi] = Functor::apply(dst[mi], elt_[mi][ni]); + } + } + } + +// Apply the functor for each row inside each group of 4 threads. +#pragma unroll + for (int mi = 0; mi < MMAS_M * 2; ++mi) { + dst[mi] = Functor::apply(dst[mi], __shfl_xor_sync(uint32_t(-1), dst[mi], 1)); + dst[mi] = Functor::apply(dst[mi], __shfl_xor_sync(uint32_t(-1), dst[mi], 2)); + } + } + + // Do a CTA-wide reduction. + template + inline __device__ float reduce_2x2() { + float dst[MMAS_M * 2]; +#if defined(USE_SAME_SUM_ORDER_IN_SOFTMAX_AS_REF_CODE) + if (Functor::IS_SUM) { +// Apply the summation inside the thread. +#pragma unroll + for (int mi = 0; mi < MMAS_M * 2; ++mi) { + // Pair-wise adds in the different threads of the reference code (x+y and z+w). + float a_01 = elt_[mi][0] + elt_[mi][1]; + float a_45 = elt_[mi][4] + elt_[mi][5]; + + //// tmp[0/1] += __shfl_xor(2) in the reference code. + a_01 += elt_[mi][2] + elt_[mi][3]; + a_45 += elt_[mi][6] + elt_[mi][7]; + + //// tmp[0/1] += __shfl_xor(8) in the reference code. 
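// Minimal sketch of the quad-level butterfly used by the reductions above and
// below: once a thread has folded its own elements into `v`, two XOR shuffles
// (lane masks 1 and 2) combine the four threads of a quad so that each of them
// ends up holding the quad-wide result. Assumes all 32 lanes are active.
template <typename Functor>
static inline __device__ float quad_allreduce_sketch(float v) {
  v = Functor::apply(v, __shfl_xor_sync(0xffffffffu, v, 1));
  v = Functor::apply(v, __shfl_xor_sync(0xffffffffu, v, 2));
  return v;
}
// Usage with the functors defined in this file: quad_allreduce_sketch<fmha::Max_>(x)
// for the running maximum, quad_allreduce_sketch<fmha::Sum_>(x) for the row sum.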
+ a_01 += a_45; + + if (MMAS_N >= 3) { + float a_89 = elt_[mi][8] + elt_[mi][9]; + a_89 += elt_[mi][10] + elt_[mi][11]; + if (MMAS_N == 4) { + float a_cd = elt_[mi][12] + elt_[mi][13]; + a_cd += elt_[mi][14] + elt_[mi][15]; + a_89 += a_cd; + } + a_01 += a_89; + } + dst[mi] = a_01; + } + } else +#endif // defined(USE_SAME_SUM_ORDER_IN_SOFTMAX_AS_REF_CODE) + { +// Apply the functor for each row inside a thread. +#pragma unroll + for (int mi = 0; mi < MMAS_M * 2; ++mi) { + dst[mi] = elt_[mi][0]; +#pragma unroll + for (int ni = 1; ni < MMAS_N * 4; ++ni) { + dst[mi] = Functor::apply(dst[mi], elt_[mi][ni]); + } + } + } + +// Apply the functor for each row inside each group of 4 threads. +#pragma unroll + for (int mi = 0; mi < MMAS_M * 2; ++mi) { + dst[mi] = Functor::apply(dst[mi], __shfl_xor_sync(uint32_t(-1), dst[mi], 1)); + dst[mi] = Functor::apply(dst[mi], __shfl_xor_sync(uint32_t(-1), dst[mi], 2)); + } + +// Store the different values. +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { + if (tidx_ % 4 == 0) { + smem_write_[(mi * Mma_tile::M_PER_MMA_PER_CTA + 0) * ELEMENTS_PER_ROW] = dst[2 * mi + 0]; + smem_write_[(mi * Mma_tile::M_PER_MMA_PER_CTA + 8) * ELEMENTS_PER_ROW] = dst[2 * mi + 1]; + } + } + + // Make sure the values are in shared memory. + __syncthreads(); + + // Load 2 values (one for each warp). + float2 tmp = reinterpret_cast(smem_)[tidx_]; + + // Compute the reduction of those 2 values in a binary-tree fashion. + return Functor::apply(tmp.x, tmp.y); + } + + // Do a CTA-wide reduction. + template + inline __device__ float reduce_1x4() { + float dst[MMAS_M * 2]; +#if defined(USE_SAME_SUM_ORDER_IN_SOFTMAX_AS_REF_CODE) + if (Functor::IS_SUM) { +// Apply the summation inside the thread. +#pragma unroll + for (int mi = 0; mi < MMAS_M * 2; ++mi) { + float tmp[2] = {0.f, 0.f}; +#pragma unroll + for (int ni = 0; ni < MMAS_N; ++ni) { + tmp[0] += elt_[mi][4 * ni + 0] + elt_[mi][4 * ni + 1]; + tmp[1] += elt_[mi][4 * ni + 2] + elt_[mi][4 * ni + 3]; + } + dst[mi] = tmp[0] + tmp[1]; + } + } else +#endif // defined(USE_SAME_SUM_ORDER_IN_SOFTMAX_AS_REF_CODE) + { +// Apply the functor for each row inside a thread. +#pragma unroll + for (int mi = 0; mi < MMAS_M * 2; ++mi) { + dst[mi] = elt_[mi][0]; +#pragma unroll + for (int ni = 1; ni < MMAS_N * 4; ++ni) { + dst[mi] = Functor::apply(dst[mi], elt_[mi][ni]); + } + } + } + +// Apply the functor for each row inside each group of 4 threads. +#pragma unroll + for (int mi = 0; mi < MMAS_M * 2; ++mi) { + dst[mi] = Functor::apply(dst[mi], __shfl_xor_sync(uint32_t(-1), dst[mi], 1)); + dst[mi] = Functor::apply(dst[mi], __shfl_xor_sync(uint32_t(-1), dst[mi], 2)); + } + +// Store the different values. +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { + if (tidx_ % 4 == 0) { + smem_write_[(mi * Mma_tile::M_PER_MMA_PER_CTA + 0) * ELEMENTS_PER_ROW] = dst[2 * mi + 0]; + smem_write_[(mi * Mma_tile::M_PER_MMA_PER_CTA + 8) * ELEMENTS_PER_ROW] = dst[2 * mi + 1]; + } + } + + // Make sure the values are in shared memory. + __syncthreads(); + + // Load 8 values (one for each warp). The /8 corresponds to /(4*2) where 4 is from the + // float4. + float4 tmp[1]; + if (tidx_ < Cta_tile::M) { + tmp[0] = reinterpret_cast(&smem_[0 * ELEMENTS / 2])[tidx_]; + } + + // Compute the reduction of those 8 values in a binary-tree fashion. + tmp[0].x = Functor::apply(tmp[0].x, tmp[0].y); + tmp[0].z = Functor::apply(tmp[0].z, tmp[0].w); + tmp[0].x = Functor::apply(tmp[0].x, tmp[0].z); + + // Return the final reduction. + return tmp[0].x; + } + + // Do a CTA-wide reduction. 
+ template + inline __device__ float reduce_1x8() { + float dst[MMAS_M * 2]; +#if defined(USE_SAME_SUM_ORDER_IN_SOFTMAX_AS_REF_CODE) + if (Functor::IS_SUM) { + // Apply the summation inside the thread. + float tmp[MMAS_M * 2][2]; +#pragma unroll + for (int mi = 0; mi < MMAS_M * 2; ++mi) { + tmp[mi][0] = 0.f; + tmp[mi][1] = 0.f; +#pragma unroll + for (int ni = 0; ni < MMAS_N; ++ni) { + tmp[mi][0] += elt_[mi][4 * ni + 0]; + tmp[mi][0] += elt_[mi][4 * ni + 1]; + tmp[mi][1] += elt_[mi][4 * ni + 2]; + tmp[mi][1] += elt_[mi][4 * ni + 3]; + } + dst[mi] = tmp[mi][0] + tmp[mi][1]; + } + } else +#endif // defined(USE_SAME_SUM_ORDER_IN_SOFTMAX_AS_REF_CODE) + { +// Apply the functor for each row inside a thread. +#pragma unroll + for (int mi = 0; mi < MMAS_M * 2; ++mi) { + dst[mi] = elt_[mi][0]; +#pragma unroll + for (int ni = 1; ni < MMAS_N * 4; ++ni) { + dst[mi] = Functor::apply(dst[mi], elt_[mi][ni]); + } + } + } + +// Apply the functor for each row inside each group of 4 threads. +#pragma unroll + for (int mi = 0; mi < MMAS_M * 2; ++mi) { + dst[mi] = Functor::apply(dst[mi], __shfl_xor_sync(uint32_t(-1), dst[mi], 1)); + dst[mi] = Functor::apply(dst[mi], __shfl_xor_sync(uint32_t(-1), dst[mi], 2)); + } + +// Store the different values. +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { + if (tidx_ % 4 == 0) { + smem_write_[(mi * Mma_tile::M_PER_MMA_PER_CTA + 0) * ELEMENTS_PER_ROW] = dst[2 * mi + 0]; + smem_write_[(mi * Mma_tile::M_PER_MMA_PER_CTA + 8) * ELEMENTS_PER_ROW] = dst[2 * mi + 1]; + } + } + + // Make sure the values are in shared memory. + __syncthreads(); + + // Load 8 values (one for each warp). The /8 corresponds to /(4*2) where 4 is from the + // float4. + float4 tmp[2]; + if (tidx_ < Cta_tile::M) { + tmp[0] = reinterpret_cast(&smem_[0 * ELEMENTS / 2])[tidx_]; + tmp[1] = reinterpret_cast(&smem_[1 * ELEMENTS / 2])[tidx_]; + } + + // Compute the reduction of those 8 values in a binary-tree fashion. + tmp[0].x = Functor::apply(tmp[0].x, tmp[0].y); + tmp[0].z = Functor::apply(tmp[0].z, tmp[0].w); + tmp[1].x = Functor::apply(tmp[1].x, tmp[1].y); + tmp[1].z = Functor::apply(tmp[1].z, tmp[1].w); + tmp[0].x = Functor::apply(tmp[0].x, tmp[0].z); + tmp[1].x = Functor::apply(tmp[1].x, tmp[1].z); + tmp[0].x = Functor::apply(tmp[0].x, tmp[1].x); + + // Return the result. + return tmp[0].x; + } + + // Do a CTA-wide reduction. + template + inline __device__ float reduce_() { + // The result of the reduction. Threads 0..Cta_tile::M-1 own a single row value. + float red = 0.f; + + // SEQLEN == 128. + if (Cta_tile::WARPS_M == 2 && Cta_tile::WARPS_N == 2) { + red = reduce_2x2(); + + // SEQLEN == 256. + } else if (Cta_tile::WARPS_M == 1 && Cta_tile::WARPS_N == 4) { + red = reduce_1x4(); + + // SEQLEN == 384. + } else if (Cta_tile::WARPS_M == 1 && Cta_tile::WARPS_N == 8) { + red = reduce_1x8(); + + // Not supported. + } else { + assert(false); + } + + return red; + } + + // Finalize the reduction. + inline __device__ void shuffle(float (&dst)[MMAS_M * 2], float red) { + // Store the value back to shared memory. + if (tidx_ < Cta_tile::M) { + smem_[tidx_] = red; + } + + // Make sure the data is in shared memory. + __syncthreads(); + +// Finally read the values. +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { + dst[2 * mi + 0] = smem_read_[mi * Mma_tile::M_PER_MMA_PER_CTA + 0]; + dst[2 * mi + 1] = smem_read_[mi * Mma_tile::M_PER_MMA_PER_CTA + 8]; + } + + // Make sure the data is in shared memory. + __syncthreads(); + } + + // Do a CTA-wide reduction. 
+ template + inline __device__ void reduce(float (&dst)[MMAS_M * 2]) { + // NOTE: 1 warp along reduce direction, no syncs + if (Cta_tile::WARPS_N == 1) { + reduce_Nx1(dst); + } else { + // The result of the reduction. Threads 0..Cta_tile::M-1 own a single row value. + float red = reduce_(); + + // Make sure we can write to shared memory. + __syncthreads(); + + // Finalize the reduction. + shuffle(dst, red); + } + } + + // Scale all the elements. + inline __device__ void scale(float const (&sum)[MMAS_M * 2]) { + // Precompute the inverse sum to normalize. Without -use_fast_math, it makes a huge deal. + float inv_sum[MMAS_M * 2]; +#pragma unroll + for (int mi = 0; mi < MMAS_M * 2; ++mi) { + inv_sum[mi] = (sum[mi] == 0.f || sum[mi] != sum[mi]) ? 1.f : 1.f / sum[mi]; + } + +// Update the values. +#pragma unroll + for (int mi = 0; mi < MMAS_M * 2; ++mi) { +#pragma unroll + for (int ni = 0; ni < MMAS_N * 4; ++ni) { + elt_[mi][ni] *= inv_sum[mi]; + } + } + } + + // Shared memory for the CTA-wide reduction. + float *smem_, *smem_write_, *smem_read_; + // The current thread index. + int tidx_; + // The elements. + float elt_[MMAS_M * 2][MMAS_N * 4]; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Softmax_hmma : public Softmax_base { + // The base class. + using Base = Softmax_base; + + // The MMAs. + enum { MMAS_M = Base::MMAS_M }; + + enum { MMAS_N = Base::MMAS_N }; + + // Whether we need to skip the softmax due to the sliding-window attention + // Otherwise, we will get NANs as those tokens are all masked out. + enum { SLIDING_WINDOW_ATTENTION = Kernel_traits::SLIDING_WINDOW_ATTENTION }; + + // Use BMM1 softcapping scale or not. + enum { ENABLE_BMM1_SOFTCAPPING_SCALE = Kernel_traits::ENABLE_BMM1_SOFTCAPPING_SCALE }; + + // The accumulators. + using Accumulator = fmha::Fragment_accumulator; + + // Softmax dst data_type (BMM2 input) + using Dst_type = typename Traits::A_type; + + // Ctor. + template + inline __device__ Softmax_hmma(Params const& params, void* smem, int bidb, int tidx) + : Base(params, smem, bidb, tidx), + params_scale_bmm1_(params.scale_bmm1), + params_softcapping_scale_bmm1_(params.softcapping_scale_bmm1) {} + + // Store the tile after softmax. + template + inline __device__ void store(Gmem_tile& gmem_tile) { + Accumulator acc[MMAS_M][MMAS_N]; +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { +#pragma unroll + for (int ni = 0; ni < MMAS_N; ++ni) { + // The elements. + float tmp_00 = this->elt_[2 * mi + 0][4 * ni + 0]; + float tmp_01 = this->elt_[2 * mi + 0][4 * ni + 1]; + float tmp_02 = this->elt_[2 * mi + 0][4 * ni + 2]; + float tmp_03 = this->elt_[2 * mi + 0][4 * ni + 3]; + float tmp_10 = this->elt_[2 * mi + 1][4 * ni + 0]; + float tmp_11 = this->elt_[2 * mi + 1][4 * ni + 1]; + float tmp_12 = this->elt_[2 * mi + 1][4 * ni + 2]; + float tmp_13 = this->elt_[2 * mi + 1][4 * ni + 3]; + + // Transform to accumulators. + acc[mi][ni].reg(0) = fmha::float2_to_16bit_2(tmp_00, tmp_01); + acc[mi][ni].reg(1) = fmha::float2_to_16bit_2(tmp_10, tmp_11); + acc[mi][ni].reg(2) = fmha::float2_to_16bit_2(tmp_02, tmp_03); + acc[mi][ni].reg(3) = fmha::float2_to_16bit_2(tmp_12, tmp_13); + } + } + + // Delegate to the gmem tile to store. + gmem_tile.store(acc); + } + + // Convert from FP16 fragments to floats. 
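// Host-side sketch of the guard used by scale() above: rows whose exponential
// sum is zero (every token masked out) or NaN are normalized with 1.0f instead
// of 1/sum, so a fully masked row stays all-zero rather than turning into
// inf/NaN probabilities.
#include <cstdio>

static void normalize_row_sketch(float* row, int n, float sum) {
  float const inv_sum = (sum == 0.f || sum != sum) ? 1.f : 1.f / sum;  // sum != sum catches NaN
  for (int i = 0; i < n; ++i) row[i] *= inv_sum;
}

int main() {
  float row[4] = {0.f, 0.f, 0.f, 0.f};  // a fully masked row: expf(-FLT_MAX - 0.f) == 0.f
  normalize_row_sketch(row, 4, 0.f);
  printf("%f %f %f %f\n", row[0], row[1], row[2], row[3]);  // stays 0, not NaN
  return 0;
}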
+ inline __device__ void unpack(Accumulator const (&acc)[MMAS_M][MMAS_N]) { +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { +#pragma unroll + for (int ni = 0; ni < MMAS_N; ++ni) { + // Normalize the values, and clamp to finite half. + uint32_t acc_0 = satfinite_h2(hmul2(acc[mi][ni].reg(0), params_scale_bmm1_)); + uint32_t acc_1 = satfinite_h2(hmul2(acc[mi][ni].reg(1), params_scale_bmm1_)); + uint32_t acc_2 = satfinite_h2(hmul2(acc[mi][ni].reg(2), params_scale_bmm1_)); + uint32_t acc_3 = satfinite_h2(hmul2(acc[mi][ni].reg(3), params_scale_bmm1_)); + + // Extract the values as floats. + half2_to_float2(this->elt_[2 * mi + 0][4 * ni + 0], this->elt_[2 * mi + 0][4 * ni + 1], + acc_0); + half2_to_float2(this->elt_[2 * mi + 1][4 * ni + 0], this->elt_[2 * mi + 1][4 * ni + 1], + acc_1); + half2_to_float2(this->elt_[2 * mi + 0][4 * ni + 2], this->elt_[2 * mi + 0][4 * ni + 3], + acc_2); + half2_to_float2(this->elt_[2 * mi + 1][4 * ni + 2], this->elt_[2 * mi + 1][4 * ni + 3], + acc_3); + + // Attention logit softcapping scale. + // 1.0f / softcapping_scale has been fused to scale_bmm1. + if constexpr (ENABLE_BMM1_SOFTCAPPING_SCALE) { + this->elt_[2 * mi + 0][4 * ni + 0] = + params_softcapping_scale_bmm1_ * __tanhf(this->elt_[2 * mi + 0][4 * ni + 0]); + this->elt_[2 * mi + 0][4 * ni + 1] = + params_softcapping_scale_bmm1_ * __tanhf(this->elt_[2 * mi + 0][4 * ni + 1]); + this->elt_[2 * mi + 1][4 * ni + 0] = + params_softcapping_scale_bmm1_ * __tanhf(this->elt_[2 * mi + 1][4 * ni + 0]); + this->elt_[2 * mi + 1][4 * ni + 1] = + params_softcapping_scale_bmm1_ * __tanhf(this->elt_[2 * mi + 1][4 * ni + 1]); + this->elt_[2 * mi + 0][4 * ni + 2] = + params_softcapping_scale_bmm1_ * __tanhf(this->elt_[2 * mi + 0][4 * ni + 2]); + this->elt_[2 * mi + 0][4 * ni + 3] = + params_softcapping_scale_bmm1_ * __tanhf(this->elt_[2 * mi + 0][4 * ni + 3]); + this->elt_[2 * mi + 1][4 * ni + 2] = + params_softcapping_scale_bmm1_ * __tanhf(this->elt_[2 * mi + 1][4 * ni + 2]); + this->elt_[2 * mi + 1][4 * ni + 3] = + params_softcapping_scale_bmm1_ * __tanhf(this->elt_[2 * mi + 1][4 * ni + 3]); + } + } + } + } + + // Apply the exp to all the elements. + // Need to make sure the results are zero when all elts are -FLT_MAX + // as it is possible that all tokens are masked out. + template + inline __device__ void apply_exp_with_mask(float const (&max)[MMAS_M * 2]) { +#pragma unroll + for (int mi = 0; mi < MMAS_M * 2; ++mi) { + float max_val = APPLY_MASK && max[mi] == -FLT_MAX ? 0.f : max[mi]; +#pragma unroll + for (int ni = 0; ni < MMAS_N * 4; ++ni) { + this->elt_[mi][ni] = expf(this->elt_[mi][ni] - max_val); + } + } + } + + // The scaling factor. + uint32_t const params_scale_bmm1_; + float const params_softcapping_scale_bmm1_; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Fragment_helper {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Fragment_helper { + // The traits. + using Traits = fmha::Volta_imma_int8_int32_traits; + // The fragment A. + using Fragment_a = fmha::Fragment_a; + // The accumulator. + using Accumulator = fmha::Fragment_accumulator; + + // Load a 2x4 array from registers. 
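// Host-side sketch of the logit softcapping applied in unpack() above: since
// 1/softcapping_scale is already fused into scale_bmm1, the unpacked value is
// raw * scale / cap, and the tanh step bounds the final logit to (-cap, cap).
// The numbers below are example values, not taken from any kernel config.
#include <cmath>
#include <cstdio>

int main() {
  float const cap = 30.f;      // softcapping_scale_bmm1 (example)
  float const scale = 0.125f;  // 1/sqrt(d); 1/cap is additionally fused into scale_bmm1
  float const raw = 4000.f;    // a raw BMM1 accumulator value (example)
  float const unpacked = raw * (scale / cap);     // what unpack() sees after the fused scale
  float const logit = cap * std::tanh(unpacked);  // bounded to (-cap, cap)
  printf("softcapped logit = %f\n", logit);
  return 0;
}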
+ static inline __device__ void load(int32_t (&dst)[2][4], Accumulator const& src) { + dst[0][0] = src.elt(0); + dst[0][1] = src.elt(1); + dst[0][2] = src.elt(2); + dst[0][3] = src.elt(3); + dst[1][0] = src.elt(4); + dst[1][1] = src.elt(5); + dst[1][2] = src.elt(6); + dst[1][3] = src.elt(7); + } + + // Store to an accumulator. + static inline __device__ void store(Accumulator& dst, uint32_t const (&src)[2][4]) { + dst.reg(0) = src[0][0]; + dst.reg(1) = src[0][1]; + dst.reg(2) = src[0][2]; + dst.reg(3) = src[0][3]; + dst.reg(4) = src[1][0]; + dst.reg(5) = src[1][1]; + dst.reg(6) = src[1][2]; + dst.reg(7) = src[1][3]; + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Fragment_helper { + // The traits. + using Traits = fmha::Turing_imma_int8_int32_traits; + // The fragment A. + using Fragment_a = fmha::Fragment_a; + // The accumulator. + using Accumulator = fmha::Fragment_accumulator; + + // Load a 2x4 array from registers. + static inline __device__ void load(int32_t (&dst)[2][4], Accumulator const& src) { + dst[0][0] = src.elt(0); + dst[0][1] = src.elt(1); + dst[0][2] = src.elt(2); + dst[0][3] = src.elt(3); + dst[1][0] = src.elt(4); + dst[1][1] = src.elt(5); + dst[1][2] = src.elt(6); + dst[1][3] = src.elt(7); + } + + // Store to an accumulator. + static inline __device__ void store(Accumulator& dst, uint32_t const (&src)[2][4]) { + dst.reg(0) = src[0][0]; + dst.reg(1) = src[0][1]; + dst.reg(2) = src[0][2]; + dst.reg(3) = src[0][3]; + dst.reg(4) = src[1][0]; + dst.reg(5) = src[1][1]; + dst.reg(6) = src[1][2]; + dst.reg(7) = src[1][3]; + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Fragment_helper { + // The traits. + using Traits = fmha::Ampere_imma_int8_int32_traits; + // The fragment A. + using Fragment_a = fmha::Fragment_a; + // The accumulator. + using Accumulator = fmha::Fragment_accumulator; + + // Load a 2x4 array from registers. + static inline __device__ void load(int32_t (&dst)[2][4], Accumulator const& src) { + dst[0][0] = src.elt(0); + dst[0][1] = src.elt(1); + dst[0][2] = src.elt(4); + dst[0][3] = src.elt(5); + dst[1][0] = src.elt(2); + dst[1][1] = src.elt(3); + dst[1][2] = src.elt(6); + dst[1][3] = src.elt(7); + } + + // Store to an accumulator. + static inline __device__ void store(Accumulator& dst, uint32_t const (&src)[2][4]) { + dst.reg(0) = src[0][0]; + dst.reg(1) = src[0][1]; + dst.reg(4) = src[0][2]; + dst.reg(5) = src[0][3]; + dst.reg(2) = src[1][0]; + dst.reg(3) = src[1][1]; + dst.reg(6) = src[1][2]; + dst.reg(7) = src[1][3]; + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Softmax_imma : public Softmax_base { + // The base class. + using Base = Softmax_base; + // The Mma tile. + using Mma_tile = typename Traits::template Mma_tile; + + // The MMAs. + enum { MMAS_M = Base::MMAS_M }; + + enum { MMAS_N = Base::MMAS_N }; + + // The accumulators. + using Accumulator = fmha::Fragment_accumulator; + // The fragment. + using Fragment_a = fmha::Fragment_a; + + // The dst type + using Dst_type = typename Traits::A_type; + + // Ctor. + template + inline __device__ Softmax_imma(Params const& params, void* smem, int bidb, int tidx) + : Base(params, smem, bidb, tidx), + params_scale_bmm1_(params.scale_bmm1), + params_scale_softmax_(params.scale_softmax) {} + + // Store the tile after softmax. 
+ template + inline __device__ void store(Gmem_tile& gmem_tile) { + float const scale = reinterpret_cast(params_scale_softmax_); + Accumulator acc[MMAS_M][MMAS_N]; +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { +#pragma unroll + for (int ni = 0; ni < MMAS_N; ++ni) { + // Scale the FP32 elements. + uint32_t tmp[2][4]; +#pragma unroll + for (int mj = 0; mj < 2; ++mj) { +#pragma unroll + for (int nj = 0; nj < 4; ++nj) { + float f = this->elt_[2 * mi + mj][4 * ni + nj] * scale; + asm volatile("cvt.rni.sat.s8.f32 %0, %1;\n" : "=r"(tmp[mj][nj]) : "f"(f)); + } + } + + // Convert to int8 and store. + Fragment_helper::store(acc[mi][ni], tmp); + } + } + + // Delegate to the gmem tile to store. + gmem_tile.store(acc); + } + + // Convert from accumulators to floats. + inline __device__ void unpack(Accumulator const (&acc)[MMAS_M][MMAS_N]) { + float const scale = reinterpret_cast(params_scale_bmm1_); +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { +#pragma unroll + for (int ni = 0; ni < MMAS_N; ++ni) { + // Load the values from the accumulator's registers. + int32_t tmp[2][4]; + Fragment_helper::load(tmp, acc[mi][ni]); + +// Convert to FP32 and scale. +#pragma unroll + for (int mj = 0; mj < 2; ++mj) { +#pragma unroll + for (int nj = 0; nj < 4; ++nj) { +#if defined(USE_I2F_EMULATION_TRICK) + float f = reinterpret_cast(tmp[mj][nj]); + this->elt_[2 * mi + mj][4 * ni + nj] = (f - FP32_I2F_MAGIC_NUMBER) * scale; +#else + this->elt_[2 * mi + mj][4 * ni + nj] = static_cast(tmp[mj][nj]) * scale; +#endif // defined(USE_I2F_EMULATION_TRICK) + } + } + } + } + } + + // Repack. We could use store/load to match the Smem_tile API. (shared by Ampere IMMA and Ada + // QMMA) + template + inline __device__ void pack(Fragment_a_ (&dst)[K][M]) { + // We pack N 16x16 acc tiles into K 16x32 tiles for A. + // In the 16x16 tile, a thread owns 4 elts per row (4 regs). + // In the 16x32 A tile, a thread owns 8 elts per row (2 regs). + // Hence we have to pack with a 2:1 ratio. + // For N = 1, K is 1: pack 4 values into dst reg 0. Set reg 1 to 0. + // For N = 2, K is 1: pack 8 values into dst regs 0, 1. + // For N = 3, K is 2: pack 12 values into dst regs (0,0), (0,1), (1,0). Set (1,1) to 0. + // For N = 4, K is 2: pack 16 values into dst regs (0,0), (0,1), (1,0), (1,1) + // For N = 5, K is 3: pack 20 values into dst regs (0,0), (0,1), (1,0), (1,1), (2,0). Set (2,1) + // to 0. For N = 6, K is 3: pack 24 values into dst regs (0,0), (0,1), (1,0), (1,1), (2,0), + // (2,1) + + static_assert(K == 3 || K == 2 || K == 1, ""); + + float const scale = reinterpret_cast(this->params_scale_softmax_); + +#pragma unroll + for (int mi = 0; mi < M; ++mi) { + // 1st row - 12 elements per row. + float tmp_00 = this->elt_[2 * mi + 0][0] * scale; + float tmp_01 = this->elt_[2 * mi + 0][1] * scale; + float tmp_02 = this->elt_[2 * mi + 0][2] * scale; + float tmp_03 = this->elt_[2 * mi + 0][3] * scale; + float tmp_04 = this->elt_[2 * mi + 0][4] * scale; + float tmp_05 = this->elt_[2 * mi + 0][5] * scale; + float tmp_06 = this->elt_[2 * mi + 0][6] * scale; + float tmp_07 = this->elt_[2 * mi + 0][7] * scale; + float tmp_08 = this->elt_[2 * mi + 0][8] * scale; + float tmp_09 = this->elt_[2 * mi + 0][9] * scale; + float tmp_0a = this->elt_[2 * mi + 0][10] * scale; + float tmp_0b = this->elt_[2 * mi + 0][11] * scale; + + // 2nd row - 12 elements per row. 
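// Host-side sketch of what the inline PTX "cvt.rni.sat.s8.f32" in store() above
// does: round to nearest (ties to even) and saturate to the int8 range before
// the bytes are packed into the accumulator registers.
#include <cmath>
#include <cstdint>
#include <cstdio>

static int8_t float_to_s8_rni_sat_sketch(float f) {
  float r = nearbyintf(f);  // rounds to nearest, ties to even, under the default FP environment
  if (r > 127.f) r = 127.f;
  if (r < -128.f) r = -128.f;
  return static_cast<int8_t>(r);
}

int main() {
  printf("%d %d %d\n", float_to_s8_rni_sat_sketch(126.6f), float_to_s8_rni_sat_sketch(1000.f),
         float_to_s8_rni_sat_sketch(-0.5f));  // 127, 127, 0
  return 0;
}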
+ float tmp_20 = this->elt_[2 * mi + 1][0] * scale; + float tmp_21 = this->elt_[2 * mi + 1][1] * scale; + float tmp_22 = this->elt_[2 * mi + 1][2] * scale; + float tmp_23 = this->elt_[2 * mi + 1][3] * scale; + float tmp_24 = this->elt_[2 * mi + 1][4] * scale; + float tmp_25 = this->elt_[2 * mi + 1][5] * scale; + float tmp_26 = this->elt_[2 * mi + 1][6] * scale; + float tmp_27 = this->elt_[2 * mi + 1][7] * scale; + float tmp_28 = this->elt_[2 * mi + 1][8] * scale; + float tmp_29 = this->elt_[2 * mi + 1][9] * scale; + float tmp_2a = this->elt_[2 * mi + 1][10] * scale; + float tmp_2b = this->elt_[2 * mi + 1][11] * scale; + + // Pack the first 12 elements to 6 registers of 2 fragments. + dst[0][mi].reg(0) = fmha::float4_to_8bitx4(tmp_00, tmp_01, tmp_02, tmp_03); + dst[0][mi].reg(1) = fmha::float4_to_8bitx4(tmp_20, tmp_21, tmp_22, tmp_23); + dst[0][mi].reg(2) = fmha::float4_to_8bitx4(tmp_04, tmp_05, tmp_06, tmp_07); + dst[0][mi].reg(3) = fmha::float4_to_8bitx4(tmp_24, tmp_25, tmp_26, tmp_27); + if (K > 1) { + dst[1][mi].reg(0) = fmha::float4_to_8bitx4(tmp_08, tmp_09, tmp_0a, tmp_0b); + dst[1][mi].reg(1) = fmha::float4_to_8bitx4(tmp_28, tmp_29, tmp_2a, tmp_2b); + } + + if (Mma_tile::MMAS_N == 6) { + float tmp_0c = this->elt_[2 * mi + 0][12] * scale; + float tmp_0d = this->elt_[2 * mi + 0][13] * scale; + float tmp_0e = this->elt_[2 * mi + 0][14] * scale; + float tmp_0f = this->elt_[2 * mi + 0][15] * scale; + float tmp_10 = this->elt_[2 * mi + 0][16] * scale; + float tmp_11 = this->elt_[2 * mi + 0][17] * scale; + float tmp_12 = this->elt_[2 * mi + 0][18] * scale; + float tmp_13 = this->elt_[2 * mi + 0][19] * scale; + float tmp_14 = this->elt_[2 * mi + 0][20] * scale; + float tmp_15 = this->elt_[2 * mi + 0][21] * scale; + float tmp_16 = this->elt_[2 * mi + 0][22] * scale; + float tmp_17 = this->elt_[2 * mi + 0][23] * scale; + + float tmp_2c = this->elt_[2 * mi + 1][12] * scale; + float tmp_2d = this->elt_[2 * mi + 1][13] * scale; + float tmp_2e = this->elt_[2 * mi + 1][14] * scale; + float tmp_2f = this->elt_[2 * mi + 1][15] * scale; + float tmp_30 = this->elt_[2 * mi + 1][16] * scale; + float tmp_31 = this->elt_[2 * mi + 1][17] * scale; + float tmp_32 = this->elt_[2 * mi + 1][18] * scale; + float tmp_33 = this->elt_[2 * mi + 1][19] * scale; + float tmp_34 = this->elt_[2 * mi + 1][20] * scale; + float tmp_35 = this->elt_[2 * mi + 1][21] * scale; + float tmp_36 = this->elt_[2 * mi + 1][22] * scale; + float tmp_37 = this->elt_[2 * mi + 1][23] * scale; + + dst[1][mi].reg(2) = fmha::float4_to_8bitx4(tmp_0c, tmp_0d, tmp_0e, tmp_0f); + dst[1][mi].reg(3) = fmha::float4_to_8bitx4(tmp_2c, tmp_2d, tmp_2e, tmp_2f); + dst[2][mi].reg(0) = fmha::float4_to_8bitx4(tmp_10, tmp_11, tmp_12, tmp_13); + dst[2][mi].reg(1) = fmha::float4_to_8bitx4(tmp_30, tmp_31, tmp_32, tmp_33); + dst[2][mi].reg(2) = fmha::float4_to_8bitx4(tmp_14, tmp_15, tmp_16, tmp_17); + dst[2][mi].reg(3) = fmha::float4_to_8bitx4(tmp_34, tmp_35, tmp_36, tmp_37); + } else if (Mma_tile::MMAS_N == 4) { + // SEQLEN == 128. 
+ float tmp_0c = this->elt_[2 * mi + 0][12] * scale; + float tmp_0d = this->elt_[2 * mi + 0][13] * scale; + float tmp_0e = this->elt_[2 * mi + 0][14] * scale; + float tmp_0f = this->elt_[2 * mi + 0][15] * scale; + + float tmp_1c = this->elt_[2 * mi + 1][12] * scale; + float tmp_1d = this->elt_[2 * mi + 1][13] * scale; + float tmp_1e = this->elt_[2 * mi + 1][14] * scale; + float tmp_1f = this->elt_[2 * mi + 1][15] * scale; + + dst[1][mi].reg(2) = fmha::float4_to_8bitx4(tmp_0c, tmp_0d, tmp_0e, tmp_0f); + dst[1][mi].reg(3) = fmha::float4_to_8bitx4(tmp_1c, tmp_1d, tmp_1e, tmp_1f); + + // SEQLEN == 384 or SEQLEN == 256. + } else if (Mma_tile::MMAS_N == 3 || Mma_tile::MMAS_N == 2) { + // TODO added second OR term for ampere imma s=256: correct? + dst[1][mi].reg(2) = 0u; + dst[1][mi].reg(3) = 0u; + } else if (Mma_tile::MMAS_N == 1) { + dst[0][mi].reg(2) = 0u; + dst[0][mi].reg(3) = 0u; + + // Not implemented. + } else { + assert(false); + } + } + } + + // The scaling factors. + uint32_t const params_scale_bmm1_, params_scale_softmax_; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Softmax_qmma : public Softmax_imma {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Softmax_qmma + : public Softmax_imma { + // The Traits + using Traits = fmha::Ada_qmma_e4m3_fp32_traits; + // The base class. + using Base = Softmax_imma; + + // The MMAs. + enum { MMAS_M = Base::MMAS_M }; + + enum { MMAS_N = Base::MMAS_N }; + + // The accumulators. + using Accumulator = fmha::Fragment_accumulator; + + // Ctor. + template + inline __device__ Softmax_qmma(Params const& params, void* smem, int bidb, int tidx) + : Base(params, smem, bidb, tidx), + params_scale_bmm1_(params.scale_bmm1_d ? *params.scale_bmm1_d : params.scale_bmm1), + params_scale_softmax_(params.scale_softmax) {} + + // Store the tile after softmax. + template + inline __device__ void store(Gmem_tile& gmem_tile) { + float const scale = reinterpret_cast(params_scale_softmax_); + Accumulator acc[MMAS_M][MMAS_N]; +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { +#pragma unroll + for (int ni = 0; ni < MMAS_N; ++ni) { + // scale + acc[mi][ni].ele(0) = this->elt_[2 * mi + 0][4 * ni + 0] * scale; + acc[mi][ni].ele(1) = this->elt_[2 * mi + 0][4 * ni + 1] * scale; + acc[mi][ni].ele(4) = this->elt_[2 * mi + 0][4 * ni + 2] * scale; + acc[mi][ni].ele(5) = this->elt_[2 * mi + 0][4 * ni + 3] * scale; + acc[mi][ni].ele(2) = this->elt_[2 * mi + 1][4 * ni + 0] * scale; + acc[mi][ni].ele(3) = this->elt_[2 * mi + 1][4 * ni + 1] * scale; + acc[mi][ni].ele(6) = this->elt_[2 * mi + 1][4 * ni + 2] * scale; + acc[mi][ni].ele(7) = this->elt_[2 * mi + 1][4 * ni + 3] * scale; + } + } + + // Delegate to the gmem tile to store. + // TODO: need fp32 to fp8 conversion (move this to gmem_tile) + gmem_tile.store(acc); + } + + // Convert from accumulators to floats. + inline __device__ void unpack(Accumulator const (&acc)[MMAS_M][MMAS_N]) { + float const scale = reinterpret_cast(params_scale_bmm1_); +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { +#pragma unroll + for (int ni = 0; ni < MMAS_N; ++ni) { + // Convert to FP32 and scale. 
+ this->elt_[2 * mi + 0][4 * ni + 0] = acc[mi][ni].elt(0) * scale; + this->elt_[2 * mi + 0][4 * ni + 1] = acc[mi][ni].elt(1) * scale; + this->elt_[2 * mi + 0][4 * ni + 2] = acc[mi][ni].elt(4) * scale; + this->elt_[2 * mi + 0][4 * ni + 3] = acc[mi][ni].elt(5) * scale; + this->elt_[2 * mi + 1][4 * ni + 0] = acc[mi][ni].elt(2) * scale; + this->elt_[2 * mi + 1][4 * ni + 1] = acc[mi][ni].elt(3) * scale; + this->elt_[2 * mi + 1][4 * ni + 2] = acc[mi][ni].elt(6) * scale; + this->elt_[2 * mi + 1][4 * ni + 3] = acc[mi][ni].elt(7) * scale; + } + } + } + + template + inline __device__ void apply_exp_with_mask(float const (&max)[MMAS_M * 2]) { +#pragma unroll + for (int mi = 0; mi < MMAS_M * 2; ++mi) { + float max_val = APPLY_MASK && max[mi] == -FLT_MAX + ? 0.f + : (max[mi] - logf(Traits::SOFTMAX_FP_QUANT_SCALE)); +#pragma unroll + for (int ni = 0; ni < MMAS_N * 4; ++ni) { + this->elt_[mi][ni] = expf(this->elt_[mi][ni] - max_val); + } + } + } + + // Pack the data to a fragment for the next GEMM. + template + inline __device__ void pack(Fragment_a (&dst)[K][M]) const { + float const scale = reinterpret_cast(this->params_scale_softmax_); + +// The canonical layout in K should be R0: [0,1,2,3] R2: [16,17,18,19] +// Note below that this is not possible with the register layout of the accumulator. +#pragma unroll + for (int mi = 0; mi < M; ++mi) { +#pragma unroll + for (int ki = 0; ki < K; ++ki) { + // 1st row - 8 elements per row. + float tmp_00 = this->elt_[2 * mi + 0][8 * ki + 0] * scale; // + 0 + float tmp_01 = this->elt_[2 * mi + 0][8 * ki + 1] * scale; // + 1 + float tmp_02 = this->elt_[2 * mi + 0][8 * ki + 2] * scale; // + 8 + float tmp_03 = this->elt_[2 * mi + 0][8 * ki + 3] * scale; // + 9 + float tmp_04 = this->elt_[2 * mi + 0][8 * ki + 4] * scale; // +16 + float tmp_05 = this->elt_[2 * mi + 0][8 * ki + 5] * scale; // +17 + float tmp_06 = this->elt_[2 * mi + 0][8 * ki + 6] * scale; // +24 + float tmp_07 = this->elt_[2 * mi + 0][8 * ki + 7] * scale; // +25 + + // 2nd row - 4 elements per row. + float tmp_10 = this->elt_[2 * mi + 1][8 * ki + 0] * scale; // + 0 + float tmp_11 = this->elt_[2 * mi + 1][8 * ki + 1] * scale; // + 1 + float tmp_12 = this->elt_[2 * mi + 1][8 * ki + 2] * scale; // + 8 + float tmp_13 = this->elt_[2 * mi + 1][8 * ki + 3] * scale; // + 9 + float tmp_14 = this->elt_[2 * mi + 1][8 * ki + 4] * scale; // +16 + float tmp_15 = this->elt_[2 * mi + 1][8 * ki + 5] * scale; // +17 + float tmp_16 = this->elt_[2 * mi + 1][8 * ki + 6] * scale; // +24 + float tmp_17 = this->elt_[2 * mi + 1][8 * ki + 7] * scale; // +25 + + // Pack to 4 registers. + dst[ki][mi].reg(0) = fmha::float4_to_fp8x4(tmp_00, tmp_01, tmp_02, tmp_03); + dst[ki][mi].reg(1) = fmha::float4_to_fp8x4(tmp_10, tmp_11, tmp_12, tmp_13); + dst[ki][mi].reg(2) = fmha::float4_to_fp8x4(tmp_04, tmp_05, tmp_06, tmp_07); + dst[ki][mi].reg(3) = fmha::float4_to_fp8x4(tmp_14, tmp_15, tmp_16, tmp_17); + } + } + } + + // The scaling factors. + uint32_t const params_scale_bmm1_, params_scale_softmax_; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Softmax_qmma + : public Softmax_imma { + // The Traits + using Traits = fmha::Ada_qmma_e4m3_fp16_traits; + // The base class. + using Base = Softmax_imma; + + // The MMAs. + enum { MMAS_M = Base::MMAS_M }; + + enum { MMAS_N = Base::MMAS_N }; + + // The accumulators. + using Accumulator = fmha::Fragment_accumulator; + + // Ctor. 
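// Sketch of the quantization trick used by apply_exp_with_mask() above for the
// e4m3 path: subtracting ln(s) from the row max multiplies every softmax
// numerator by s, because exp(x - (m - ln s)) == s * exp(x - m). The actual
// value of s (Traits::SOFTMAX_FP_QUANT_SCALE) is not reproduced here.
#include <cmath>
#include <cstdio>

int main() {
  float const x = 1.7f, m = 3.2f, s = 256.f;  // example values
  float const shifted = std::exp(x - (m - std::log(s)));
  float const scaled = s * std::exp(x - m);
  printf("%f vs %f\n", shifted, scaled);  // equal up to rounding
  return 0;
}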
+ template + inline __device__ Softmax_qmma(Params const& params, void* smem, int bidb, int tidx) + : Base(params, smem, bidb, tidx), + params_scale_bmm1_(params.scale_bmm1), + params_scale_softmax_(params.scale_softmax) {} + + // Store the tile after softmax. + template + inline __device__ void store(Gmem_tile& gmem_tile) { + float const scale = reinterpret_cast(params_scale_softmax_); + Accumulator acc[MMAS_M][MMAS_N]; +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { +#pragma unroll + for (int ni = 0; ni < MMAS_N; ++ni) { + // scale + acc[mi][ni].ele(0) = this->elt_[2 * mi + 0][4 * ni + 0] * scale; + acc[mi][ni].ele(1) = this->elt_[2 * mi + 0][4 * ni + 1] * scale; + acc[mi][ni].ele(4) = this->elt_[2 * mi + 0][4 * ni + 2] * scale; + acc[mi][ni].ele(5) = this->elt_[2 * mi + 0][4 * ni + 3] * scale; + acc[mi][ni].ele(2) = this->elt_[2 * mi + 1][4 * ni + 0] * scale; + acc[mi][ni].ele(3) = this->elt_[2 * mi + 1][4 * ni + 1] * scale; + acc[mi][ni].ele(6) = this->elt_[2 * mi + 1][4 * ni + 2] * scale; + acc[mi][ni].ele(7) = this->elt_[2 * mi + 1][4 * ni + 3] * scale; + } + } + + // Delegate to the gmem tile to store. + // TODO: need fp32 to fp8 conversion (move this to gmem_tile) + gmem_tile.store(acc); + } + + // Convert from accumulators to floats. + inline __device__ void unpack(Accumulator const (&acc)[MMAS_M][MMAS_N]) { +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { +#pragma unroll + for (int ni = 0; ni < MMAS_N; ++ni) { + // Convert to FP32 and scale. + float2* elt_ptr0 = reinterpret_cast(this->elt_[2 * mi + 0] + 4 * ni); + float2* elt_ptr1 = reinterpret_cast(this->elt_[2 * mi + 1] + 4 * ni); + elt_ptr0[0] = fmha::half2_to_float2(fmha::hmul2(acc[mi][ni].reg(0), params_scale_bmm1_)); + elt_ptr0[1] = fmha::half2_to_float2(fmha::hmul2(acc[mi][ni].reg(2), params_scale_bmm1_)); + elt_ptr1[0] = fmha::half2_to_float2(fmha::hmul2(acc[mi][ni].reg(1), params_scale_bmm1_)); + elt_ptr1[1] = fmha::half2_to_float2(fmha::hmul2(acc[mi][ni].reg(3), params_scale_bmm1_)); + } + } + } + + // The scaling factors. + uint32_t const params_scale_bmm1_, params_scale_softmax_; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Softmax {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Softmax { + // The traits class. + using Traits = fmha::Volta_hmma_fp16_traits; + // The Mma tile. + using Mma_tile = typename Traits::template Mma_tile; + // The accumulators. + using Accumulator = fmha::Fragment_accumulator; + // The fragment. + using Fragment_a = fmha::Fragment_a; + + // Softmax dst data_type (BMM2 input) + using Dst_type = typename Traits::A_type; + + // The number of MMAs in M/N dimensions. + enum { MMAS_M = Mma_tile::MMAS_M }; + + enum { MMAS_N = Mma_tile::MMAS_N }; + + // The number of groups of warp such that we have at most 2 warps writing consecutive elements. + enum { GROUPS = fmha::Div_up::VALUE }; + + // The number of elements that we are going to store per row. + enum { ELEMENTS_PER_ROW = Cta_tile::WARPS_N / GROUPS }; + + // The number of rows. + enum { ROWS = Cta_tile::M * GROUPS }; + + // The total number of elements. + enum { ELEMENTS = ROWS * ELEMENTS_PER_ROW }; + + // Use BMM1 softcapping scale or not. + enum { ENABLE_BMM1_SOFTCAPPING_SCALE = Kernel_traits::ENABLE_BMM1_SOFTCAPPING_SCALE }; + + // If shared memory is used + enum { USE_SHARED_MEMORY = Cta_tile::WARPS_N > 1 }; + + // The number of rows per thread. 
+ enum { ROWS_PER_THREAD = MMAS_M }; + + // DEBUG. + static_assert(ELEMENTS == Cta_tile::M * Cta_tile::WARPS_N, ""); + + // END OF DEBUG. + + // Ctor. + template + inline __device__ Softmax(Params const& params, void* smem, int bidb, int tidx) + : params_scale_bmm1_(params.scale_bmm1), + params_softcapping_scale_bmm1_(params.softcapping_scale_bmm1), + smem_(reinterpret_cast(smem)), + tidx_(tidx) { + // Extract the position in the warp. + int warp = tidx / Cta_tile::THREADS_PER_WARP; + int lane = tidx % Cta_tile::THREADS_PER_WARP; + + // Decompose the warp index into M and N. + int warp_m = warp % Cta_tile::WARPS_M; + int warp_n = warp / Cta_tile::WARPS_M; + + // Decompose the warp-n index into group/position-inside-the-group. + int warp_g = warp_n / ELEMENTS_PER_ROW; + int warp_i = warp_n % ELEMENTS_PER_ROW; + + // The row written/read by the thread (threads i and i+8 are on the same row). + int row = (lane & 0x10) / 2 + (lane & 0x07); + + // The location written by the threads. + int write_row = warp_g * Cta_tile::M + warp_m * Mma_tile::M_PER_MMA + row; + int write_col = warp_i; + + // Assemble the write pointer. + smem_write_ = &smem_[write_row * ELEMENTS_PER_ROW + write_col]; + // Assemble the read pointer. + smem_read_ = &smem_[warp_m * Mma_tile::M_PER_MMA + row]; + } + + // Apply mask before softmax. Use 1 byte per MMA distributed as 1x8. + template + inline __device__ void apply_mask(Mask const& mask) { +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { +#pragma unroll + for (int ni = 0; ni < MMAS_N; ++ni) { +#pragma unroll + for (int ii = 0; ii < 8; ++ii) { + if (!mask.is_valid(mi, ni, 0, ii)) { + elt_[mi][8 * ni + ii] = -FLT_MAX; + } + } + } + } + } + + template + inline __device__ void apply_mask_alibi(Mask const& mask, int head_id, + AlibiParams const& alibi_params) { + // 'if constexpr' because ALiBi is only defined for causal masks + if constexpr (Kernel_traits::CAUSAL_MASK) { + float m = get_alibi_head_scaling_factor(head_id, alibi_params); +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { +#pragma unroll + for (int ni = 0; ni < MMAS_N; ++ni) { +#pragma unroll + for (int ii = 0; ii < 8; ++ii) { + int row, col; + mask.get_row_col(row, col, mi, ni, 0, ii); + if (mask.is_valid(row, col)) { + // Since softmax is shift invariant, + // it is sufficient just to use the column as the multiplier + elt_[mi][8 * ni + ii] = elt_[mi][8 * ni + ii] * alibi_params.scale_after_alibi + + m * (col + alibi_params.sequence_pos_offset); + } else { + elt_[mi][8 * ni + ii] = -FLT_MAX; + } + } + } + } + } else { + __builtin_unreachable(); + } + } + + // Apply the mask to unpacked data. + inline __device__ void apply_mask(uint32_t const (&packed_mask)[MMAS_M]) { + // This code works only if we have MMAS_N <= 4. + static_assert(MMAS_N <= 4, ""); + + // Expand the mask. + int mask[MMAS_M][MMAS_N * 8]; +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { +#pragma unroll + for (int ii = 0; ii < MMAS_N * 8; ++ii) { + mask[mi][ii] = packed_mask[mi] & (1u << ii); + } + } + +// Apply the mask. +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { +#pragma unroll + for (int ni = 0; ni < MMAS_N * 8; ++ni) { + if (!mask[mi][ni]) { + elt_[mi][ni] = -FLT_MAX; + } + } + } + } + + // Mask the elements that are outside the the sequence length. + inline __device__ void apply_mask(int const seqlen) { + // The warp/lane decomposition. + int const warp = threadIdx.x / Cta_tile::THREADS_PER_WARP; + int const lane = threadIdx.x % Cta_tile::THREADS_PER_WARP; + + // The warp in the n dimension. 
+ int const warp_n = warp / Cta_tile::WARPS_M; + // The base position within a quad. + int const offset = warp_n * 16 + (threadIdx.x & 0x08) / 2; + +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { +#pragma unroll + for (int ni = 0; ni < MMAS_N; ++ni) { + // The position in the sequence. + int pos = offset + ni * Mma_tile::N_PER_MMA_PER_CTA; + + // Determine the position in the sequence. + if (pos + 0 >= seqlen) { + elt_[mi][8 * ni + 0] = -FLT_MAX; + } + if (pos + 1 >= seqlen) { + elt_[mi][8 * ni + 1] = -FLT_MAX; + } + if (pos + 2 >= seqlen) { + elt_[mi][8 * ni + 2] = -FLT_MAX; + } + if (pos + 3 >= seqlen) { + elt_[mi][8 * ni + 3] = -FLT_MAX; + } + if (pos + 8 >= seqlen) { + elt_[mi][8 * ni + 4] = -FLT_MAX; + } + if (pos + 9 >= seqlen) { + elt_[mi][8 * ni + 5] = -FLT_MAX; + } + if (pos + 10 >= seqlen) { + elt_[mi][8 * ni + 6] = -FLT_MAX; + } + if (pos + 11 >= seqlen) { + elt_[mi][8 * ni + 7] = -FLT_MAX; + } + } + } + } + + // Apply the exp to all the elements. + // Need to make sure the results are zero when all elts are -FLT_MAX + // as it is possible that all tokens are masked out. + template + inline __device__ void apply_exp_with_mask(float const (&max)[MMAS_M]) { +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { + float max_val = APPLY_MASK && max[mi] == -FLT_MAX ? 0.f : max[mi]; +#pragma unroll + for (int ni = 0; ni < MMAS_N * 8; ++ni) { + this->elt_[mi][ni] = expf(this->elt_[mi][ni] - max_val); + } + } + } + + // Apply the exp to all the elements. + inline __device__ void apply_exp(float const max) { +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { +#pragma unroll + for (int ni = 0; ni < MMAS_N * 8; ++ni) { + elt_[mi][ni] = apply_exp_(elt_[mi][ni], max); + } + } + } + + // Apply the exp to all the elements. + inline __device__ void apply_exp(float const (&max)[MMAS_M]) { +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { +#pragma unroll + for (int ni = 0; ni < MMAS_N * 8; ++ni) { + elt_[mi][ni] = apply_exp_(elt_[mi][ni], max[mi]); + } + } + } + + // Pack the data to a fragment for the next GEMM. + template + inline __device__ void pack(Fragment_a (&dst)[K][M]) const { + static_assert(MMAS_M == M && MMAS_N == K, ""); +#pragma unroll + for (int mi = 0; mi < M; ++mi) { +#pragma unroll + for (int ki = 0; ki < K; ++ki) { + // 8 elements per row. + float tmp_0 = this->elt_[mi][8 * ki + 0]; + float tmp_1 = this->elt_[mi][8 * ki + 1]; + float tmp_2 = this->elt_[mi][8 * ki + 2]; + float tmp_3 = this->elt_[mi][8 * ki + 3]; + float tmp_4 = this->elt_[mi][8 * ki + 4]; + float tmp_5 = this->elt_[mi][8 * ki + 5]; + float tmp_6 = this->elt_[mi][8 * ki + 6]; + float tmp_7 = this->elt_[mi][8 * ki + 7]; + + // Pack to 4 registers. + dst[ki][mi].reg(0) = fmha::float2_to_16bit_2(tmp_0, tmp_1); + dst[ki][mi].reg(1) = fmha::float2_to_16bit_2(tmp_2, tmp_3); + dst[ki][mi].reg(2) = fmha::float2_to_16bit_2(tmp_4, tmp_5); + dst[ki][mi].reg(3) = fmha::float2_to_16bit_2(tmp_6, tmp_7); + } + } + } + + // Do a CTA-wide reduction. + template + inline __device__ void reduce_Nx1(float (&dst)[MMAS_M]) { +#if defined(USE_SAME_SUM_ORDER_IN_SOFTMAX_AS_REF_CODE) + if (Functor::IS_SUM) { +// Apply the summation inside the thread for each row. +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { + // The thread local math in the reference code. 
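// Sketch of the per-thread column mapping that the Volta HMMA apply_mask(seqlen)
// above relies on: within one 16-wide MMA column block, the 8 fp16 elements a
// thread owns sit at offsets {0,1,2,3} and {8,9,10,11} relative to a base that
// depends on bit 3 of the thread index; the warp_n and ni offsets are added
// separately, as in the code above.
static inline __device__ int volta_elt_col_sketch(int tidx_in_warp, int elt /* 0..7 */) {
  int const base = (tidx_in_warp & 0x08) / 2;  // 0 or 4
  return base + (elt < 4 ? elt : elt + 4);     // {0,1,2,3} then {8,9,10,11}
}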
+ float sums[MMAS_N * 2]; +#pragma unroll + for (int ii = 0; ii < MMAS_N * 2; ++ii) { + sums[ii] = elt_[mi][4 * ii + 0]; + sums[ii] += elt_[mi][4 * ii + 1]; + sums[ii] += elt_[mi][4 * ii + 2]; + sums[ii] += elt_[mi][4 * ii + 3]; + } + +// Columns 0 and 8: __shfl( 2). +#pragma unroll + for (int ii = 0; ii < MMAS_N; ++ii) { + sums[2 * ii] += sums[2 * ii + 1]; + } + +// Columns 0 and 32: __shfl( 8). +#pragma unroll + for (int ii = 0; ii < MMAS_N / 2; ++ii) { // MMAS_N / 2 == 0 if MMAS_N <= 1. + sums[4 * ii] += sums[4 * ii + 2]; + } + + // Columns 0 and 64: __shfl(16). + if (MMAS_N == 3) { + sums[0] += sums[4]; + } else if (MMAS_N >= 4) { +#pragma unroll + for (int ii = 0; ii < MMAS_N / 4; ++ii) { // MMAS_N / 4 == 0 if MMAS_N <= 2. + sums[8 * ii] += sums[8 * ii + 4]; + } + } + + // Store the final value for that row. + dst[mi] = sums[0]; + } + } else +#endif // defined(USE_SAME_SUM_ORDER_IN_SOFTMAX_AS_REF_CODE) + { +// Apply the functor for each row inside a thread. +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { + dst[mi] = elt_[mi][0]; +#pragma unroll + for (int ni = 1; ni < MMAS_N * 8; ++ni) { + dst[mi] = Functor::apply(dst[mi], elt_[mi][ni]); + } + } + } + +// Apply the functor for each row. +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { + dst[mi] = Functor::apply(dst[mi], __shfl_xor_sync(uint32_t(-1), dst[mi], 8)); + } + } + + // Do a CTA-wide reduction. + template + inline __device__ float reduce_2x2() { + float dst[MMAS_M]; +#if defined(USE_SAME_SUM_ORDER_IN_SOFTMAX_AS_REF_CODE) + if (Functor::IS_SUM) { +// Apply the summation inside the thread for each row. +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { + // The thread local math in the reference code. + float sums[MMAS_N * 2]; +#pragma unroll + for (int ii = 0; ii < MMAS_N * 2; ++ii) { + sums[ii] = elt_[mi][4 * ii + 0]; + sums[ii] += elt_[mi][4 * ii + 1]; + sums[ii] += elt_[mi][4 * ii + 2]; + sums[ii] += elt_[mi][4 * ii + 3]; + } + +// Columns 0 and 8: __shfl( 2). +#pragma unroll + for (int ii = 0; ii < MMAS_N; ++ii) { + sums[2 * ii] += sums[2 * ii + 1]; + } + +// Columns 0 and 32: __shfl( 8). +#pragma unroll + for (int ii = 0; ii < MMAS_N / 2; ++ii) { // MMAS_N / 2 == 0 if MMAS_N <= 1. + sums[4 * ii] += sums[4 * ii + 2]; + } + + // Columns 0 and 64: __shfl(16). + if (MMAS_N == 3) { + sums[0] += sums[4]; + } else if (MMAS_N >= 4) { +#pragma unroll + for (int ii = 0; ii < MMAS_N / 4; ++ii) { // MMAS_N / 4 == 0 if MMAS_N <= 2. + sums[8 * ii] += sums[8 * ii + 4]; + } + } + + // Store the final value for that row. + dst[mi] = sums[0]; + } + } else +#endif // defined(USE_SAME_SUM_ORDER_IN_SOFTMAX_AS_REF_CODE) + { +// Apply the functor for each row inside a thread. +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { + dst[mi] = elt_[mi][0]; +#pragma unroll + for (int ni = 1; ni < MMAS_N * 8; ++ni) { + dst[mi] = Functor::apply(dst[mi], elt_[mi][ni]); + } + } + } + +// Apply the functor for each row. +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { + dst[mi] = Functor::apply(dst[mi], __shfl_xor_sync(uint32_t(-1), dst[mi], 8)); + } + +// Store the different values to shared memory. +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { + if (tidx_ % 16 < 8) { + smem_write_[mi * Mma_tile::M_PER_MMA_PER_CTA * ELEMENTS_PER_ROW] = dst[mi]; + } + } + + // Make sure the values are in shared memory. + __syncthreads(); + + // Load 2 values (one for each warp). + float2 tmp = reinterpret_cast(smem_)[tidx_]; + + // Compute the reduction of those 2 values in a binary-tree fashion. 
+ return Functor::apply(tmp.x, tmp.y); + } + + // Do a CTA-wide reduction. + template + inline __device__ float reduce_1x4() { + float dst[MMAS_M]; +#if defined(USE_SAME_SUM_ORDER_IN_SOFTMAX_AS_REF_CODE) + if (Functor::IS_SUM) { +// Apply the summation inside the thread for each row. +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { + // The thread local math in the reference code. + float sums[MMAS_N * 2]; +#pragma unroll + for (int ii = 0; ii < MMAS_N * 2; ++ii) { + sums[ii] = elt_[mi][4 * ii + 0]; + sums[ii] += elt_[mi][4 * ii + 1]; + sums[ii] += elt_[mi][4 * ii + 2]; + sums[ii] += elt_[mi][4 * ii + 3]; + } + + // Columns 0 and 128 (the ref code uses a step of 128). Not needed if SEQLEN <= 128. + if (Cta_tile::N > 128) { +#pragma unroll + for (int ii = 0; ii < MMAS_N; ++ii) { + sums[ii] += sums[MMAS_N + ii]; + } + } + +// Columns 0 and 8: __shfl( 2). +#pragma unroll + for (int ii = 0; ii < MMAS_N; ++ii) { + sums[2 * ii] += sums[2 * ii + 1]; + } + +// Columns 0 and 64: __shfl(16). +#pragma unroll + for (int ii = 0; ii < MMAS_N / 2; ++ii) { // MMAS_N / 2 == 0 if MMAS_N <= 1. + sums[4 * ii] += sums[4 * ii + 2]; + } + + // Store the final value for that row. + dst[mi] = sums[0]; + } + } else +#endif // defined(USE_SAME_SUM_ORDER_IN_SOFTMAX_AS_REF_CODE) + { +// Apply the functor for each row inside a thread. +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { + dst[mi] = elt_[mi][0]; +#pragma unroll + for (int ni = 1; ni < MMAS_N * 8; ++ni) { + dst[mi] = Functor::apply(dst[mi], elt_[mi][ni]); + } + } + } + +// Apply the functor for each row. +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { + dst[mi] = Functor::apply(dst[mi], __shfl_xor_sync(uint32_t(-1), dst[mi], 8)); + } + +// Store the different values to shared memory. +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { + if (tidx_ % 16 < 8) { + smem_write_[mi * Mma_tile::M_PER_MMA_PER_CTA * ELEMENTS_PER_ROW] = dst[mi]; + } + } + + // Make sure the values are in shared memory. + __syncthreads(); + + // Load 4 values (one for each warp). + float2 tmp[2]; + if (tidx_ < Cta_tile::M) { + tmp[0] = reinterpret_cast(&smem_[0 * ELEMENTS / 2])[tidx_]; + tmp[1] = reinterpret_cast(&smem_[1 * ELEMENTS / 2])[tidx_]; + } + + // Compute the reduction of those 4 values in a binary-tree fashion. + tmp[0].x = Functor::apply(tmp[0].x, tmp[0].y); + tmp[1].x = Functor::apply(tmp[1].x, tmp[1].y); + tmp[0].x = Functor::apply(tmp[0].x, tmp[1].x); + + // Return the final reduction. + return tmp[0].x; + } + + // Do a CTA-wide reduction. + template + inline __device__ float reduce_1x8() { + float dst[MMAS_M]; +#if defined(USE_SAME_SUM_ORDER_IN_SOFTMAX_AS_REF_CODE) + if (Functor::IS_SUM) { +// Apply the summation inside the thread for each row. +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { + // The thread local math in the reference code. + float sums[MMAS_N * 2]; +#pragma unroll + for (int ii = 0; ii < MMAS_N * 2; ++ii) { + sums[ii] = elt_[mi][4 * ii + 0]; + sums[ii] += elt_[mi][4 * ii + 1]; + sums[ii] += elt_[mi][4 * ii + 2]; + sums[ii] += elt_[mi][4 * ii + 3]; + } + +// Columns 0 and 128 (the ref code uses a step of 128). Not needed if SEQLEN <= 128. +#pragma unroll + for (int ii = 1; ii < MMAS_N; ++ii) { + sums[0] += sums[2 * ii + 0]; + sums[1] += sums[2 * ii + 1]; + } + + // Columns 0 and 8: __shfl( 2). + dst[mi] = sums[0] + sums[1]; + } + } else +#endif // defined(USE_SAME_SUM_ORDER_IN_SOFTMAX_AS_REF_CODE) + { +// Apply the functor for each row inside a thread. 
+#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { + dst[mi] = elt_[mi][0]; +#pragma unroll + for (int ni = 1; ni < MMAS_N * 8; ++ni) { + dst[mi] = Functor::apply(dst[mi], elt_[mi][ni]); + } + } + } + +// Apply the functor for each row. +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { + dst[mi] = Functor::apply(dst[mi], __shfl_xor_sync(uint32_t(-1), dst[mi], 8)); + } + +// Store the different values to shared memory. +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { + if (tidx_ % 16 < 8) { + smem_write_[mi * Mma_tile::M_PER_MMA_PER_CTA * ELEMENTS_PER_ROW] = dst[mi]; + } + } + + // Make sure the values are in shared memory. + __syncthreads(); + + // Load 8 values (one for each warp). + float2 tmp[4]; + if (tidx_ < Cta_tile::M) { + tmp[0] = reinterpret_cast(&smem_[0 * ELEMENTS / 4])[tidx_]; + tmp[1] = reinterpret_cast(&smem_[1 * ELEMENTS / 4])[tidx_]; + tmp[2] = reinterpret_cast(&smem_[2 * ELEMENTS / 4])[tidx_]; + tmp[3] = reinterpret_cast(&smem_[3 * ELEMENTS / 4])[tidx_]; + } + + // // DEBUG. + // if( tidx_ == 0 ) { + // #pragma unroll + // for( int ii = 0; ii < 4; ++ii ) { + // printf("tidx=%3d tmp[%d]=%8.3f %8.3f\n", tidx_, ii, tmp[ii].x, tmp[ii].y); + // } + // } + // // END OF DEBUG. + + // Compute the reduction of those 8 values in a binary-tree fashion. + tmp[0].x = Functor::apply(tmp[0].x, tmp[0].y); + tmp[1].x = Functor::apply(tmp[1].x, tmp[1].y); + tmp[2].x = Functor::apply(tmp[2].x, tmp[2].y); + tmp[3].x = Functor::apply(tmp[3].x, tmp[3].y); + + tmp[0].x = Functor::apply(tmp[0].x, tmp[1].x); + tmp[2].x = Functor::apply(tmp[2].x, tmp[3].x); + + tmp[0].x = Functor::apply(tmp[0].x, tmp[2].x); + + // Return the final reduction. + return tmp[0].x; + } + + // Do a CTA-wide reduction. + template + inline __device__ float reduce_() { + // The final reduction. + float red = 0.f; + + // SEQLEN == 128. + if (Cta_tile::WARPS_M == 2 && Cta_tile::WARPS_N == 2) { + red = reduce_2x2(); + + // SEQLEN == 256. + } else if (Cta_tile::WARPS_M == 1 && Cta_tile::WARPS_N == 4) { + red = reduce_1x4(); + + // SEQLEN == 256. + } else if (Cta_tile::WARPS_M == 1 && Cta_tile::WARPS_N == 8) { + red = reduce_1x8(); + + // Not supported. + } else { + assert(false); + } + + return red; + } + + // Finalize the reduction. + inline __device__ void shuffle(float (&dst)[MMAS_M], float red) { + // Store the value back to shared memory. + if (tidx_ < Cta_tile::M) { + smem_[tidx_] = red; + } + + // Make sure the data is in shared memory. + __syncthreads(); + +// Finally read the values. +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { + dst[mi] = smem_read_[mi * Mma_tile::M_PER_MMA_PER_CTA]; + } + + // Make sure we are done reading shared memory. + __syncthreads(); + } + + // Do a CTA-wide reduction. + template + inline __device__ void reduce(float (&dst)[MMAS_M]) { + // NOTE: 1 warp along reduce direction, no syncs + if (Cta_tile::WARPS_N == 1) { + reduce_Nx1(dst); + } else { + // The result of the reduction. Threads 0..Cta_tile::M-1 own a valid value. + float red = reduce_(); + + // Make sure we can write to shared memory. + __syncthreads(); + + // Finalize the reduction. + shuffle(dst, red); + } + } + + // Scale all the elements. + inline __device__ void scale(float const (&sum)[MMAS_M]) { + // Precompute the inverse sum to normalize. Without -use_fast_math, it makes a huge deal. + float inv_sum[MMAS_M]; +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { + inv_sum[mi] = (sum[mi] == 0.f || sum[mi] != sum[mi]) ? 1.f : 1.f / sum[mi]; + } + +// Update the values. 
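+// Note: sum[mi] != sum[mi] detects NaN. A zero (or NaN) row sum occurs when every element
+// of the row was masked to -FLT_MAX; the 1.f fallback above leaves such rows at zero
+// instead of producing Inf/NaN during normalization.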
+#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { +#pragma unroll + for (int ni = 0; ni < MMAS_N * 8; ++ni) { + elt_[mi][ni] *= inv_sum[mi]; + } + } + } + + // Store the tile after softmax. + template + inline __device__ void store(Gmem_tile& gmem_tile) { + Accumulator acc[MMAS_M][MMAS_N]; +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { +#pragma unroll + for (int ni = 0; ni < MMAS_N; ++ni) { + // The elements. + float tmp_00 = this->elt_[mi][8 * ni + 0]; + float tmp_01 = this->elt_[mi][8 * ni + 1]; + float tmp_02 = this->elt_[mi][8 * ni + 2]; + float tmp_03 = this->elt_[mi][8 * ni + 3]; + float tmp_04 = this->elt_[mi][8 * ni + 4]; + float tmp_05 = this->elt_[mi][8 * ni + 5]; + float tmp_06 = this->elt_[mi][8 * ni + 6]; + float tmp_07 = this->elt_[mi][8 * ni + 7]; + + // Transform to accumulators. + acc[mi][ni].reg(0) = fmha::float2_to_16bit_2(tmp_00, tmp_01); + acc[mi][ni].reg(1) = fmha::float2_to_16bit_2(tmp_02, tmp_03); + acc[mi][ni].reg(2) = fmha::float2_to_16bit_2(tmp_04, tmp_05); + acc[mi][ni].reg(3) = fmha::float2_to_16bit_2(tmp_06, tmp_07); + } + } + + // Delegate to the gmem tile to store. + gmem_tile.store(acc); + } + + // Convert from FP16 fragments to floats. + inline __device__ void unpack(Accumulator const (&acc)[MMAS_M][MMAS_N]) { +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { +#pragma unroll + for (int ni = 0; ni < MMAS_N; ++ni) { + // Normalize the values, and clamp to finite half. + uint32_t acc_0 = satfinite_h2(hmul2(acc[mi][ni].reg(0), params_scale_bmm1_)); + uint32_t acc_1 = satfinite_h2(hmul2(acc[mi][ni].reg(1), params_scale_bmm1_)); + uint32_t acc_2 = satfinite_h2(hmul2(acc[mi][ni].reg(2), params_scale_bmm1_)); + uint32_t acc_3 = satfinite_h2(hmul2(acc[mi][ni].reg(3), params_scale_bmm1_)); + + // Extract the values as floats. + half2_to_float2(this->elt_[mi][8 * ni + 0], this->elt_[mi][8 * ni + 1], acc_0); + half2_to_float2(this->elt_[mi][8 * ni + 2], this->elt_[mi][8 * ni + 3], acc_1); + half2_to_float2(this->elt_[mi][8 * ni + 4], this->elt_[mi][8 * ni + 5], acc_2); + half2_to_float2(this->elt_[mi][8 * ni + 6], this->elt_[mi][8 * ni + 7], acc_3); + + if constexpr (ENABLE_BMM1_SOFTCAPPING_SCALE) { +#pragma unroll + for (int i = 0; i < 8; i++) { + // 1.0f / softcapping_scale has been fused to scale_bmm1. + this->elt_[mi][8 * ni + i] = + params_softcapping_scale_bmm1_ * __tanhf(this->elt_[mi][8 * ni + i]); + } + } + } + } + } + + // The scaling factor. + uint32_t const params_scale_bmm1_; + float const params_softcapping_scale_bmm1_; + // Shared memory for the CTA-wide reduction. + float *smem_, *smem_write_, *smem_read_; + // The current thread index. + int tidx_; + // The elements. + float elt_[MMAS_M][MMAS_N * 8]; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Softmax + : public Softmax_hmma { + // The traits. + using Traits = fmha::Turing_hmma_fp16_traits; + // The base class. + using Base = Softmax_hmma; + // The fragment. + using Fragment_a = fmha::Fragment_a; + // Softmax dst data_type (BMM2 input) + using Dst_type = typename Traits::A_type; + + // Ctor. + template + inline __device__ Softmax(Params const& params, void* smem, int bidb, int tidx) + : Base(params, smem, bidb, tidx) {} + + // Pack the data to a fragment for the next GEMM. 
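+  // Each K step takes two adjacent columns from a pair of softmax rows: reg(0) packs the
+  // two elements of row 2*mi as fp16x2, reg(1) the two elements of row 2*mi+1.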
+ template + inline __device__ void pack(Fragment_a (&dst)[K][M]) const { + static_assert(Base::Mma_tile::MMAS_M == M && Base::Mma_tile::MMAS_N * 4 == K * 2, ""); +#pragma unroll + for (int mi = 0; mi < M; ++mi) { +#pragma unroll + for (int ki = 0; ki < K; ++ki) { + // 1st row - 2 elements per row. + float tmp_00 = this->elt_[2 * mi + 0][2 * ki + 0]; + float tmp_01 = this->elt_[2 * mi + 0][2 * ki + 1]; + + // 2nd row - 2 elements per row. + float tmp_10 = this->elt_[2 * mi + 1][2 * ki + 0]; + float tmp_11 = this->elt_[2 * mi + 1][2 * ki + 1]; + + // Pack to 2 registers. + dst[ki][mi].reg(0) = fmha::float2_to_16bit_2(tmp_00, tmp_01); + dst[ki][mi].reg(1) = fmha::float2_to_16bit_2(tmp_10, tmp_11); + } + } + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Softmax + : public Softmax_imma { + // The traits. + using Traits = fmha::Volta_imma_int8_int32_traits; + // The base class. + using Base = Softmax_imma; + // The fragment. + using Fragment_a = fmha::Fragment_a; + + // Ctor. + template + inline __device__ Softmax(Params const& params, void* smem, int bidb, int tidx) + : Base(params, smem, bidb, tidx) {} + + // Repack. We could use store/load to match the Smem_tile API. + template + inline __device__ void pack(Fragment_a (&dst)[K][M]) { + static_assert(Base::Mma_tile::MMAS_M == M && Base::Mma_tile::MMAS_N == K, ""); + float const scale = reinterpret_cast(this->params_scale_softmax_); +#pragma unroll + for (int mi = 0; mi < M; ++mi) { +#pragma unroll + for (int ki = 0; ki < K; ++ki) { + // 1st row - 4 elements per row. + float tmp_00 = this->elt_[2 * mi + 0][4 * ki + 0] * scale; + float tmp_01 = this->elt_[2 * mi + 0][4 * ki + 1] * scale; + float tmp_02 = this->elt_[2 * mi + 0][4 * ki + 2] * scale; + float tmp_03 = this->elt_[2 * mi + 0][4 * ki + 3] * scale; + + // 2nd row - 4 elements per row. + float tmp_10 = this->elt_[2 * mi + 1][4 * ki + 0] * scale; + float tmp_11 = this->elt_[2 * mi + 1][4 * ki + 1] * scale; + float tmp_12 = this->elt_[2 * mi + 1][4 * ki + 2] * scale; + float tmp_13 = this->elt_[2 * mi + 1][4 * ki + 3] * scale; + + // Pack to 2 registers. + dst[ki][mi].reg(0) = float4_to_char4(tmp_00, tmp_01, tmp_02, tmp_03); + dst[ki][mi].reg(1) = float4_to_char4(tmp_10, tmp_11, tmp_12, tmp_13); + } + } + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Softmax + : public Softmax_imma { + // The traits. + using Traits = fmha::Turing_imma_int8_int32_traits; + // The base class. + using Base = Softmax_imma; + // The fragment. + using Fragment_a = fmha::Fragment_a; + + // Ctor. + template + inline __device__ Softmax(Params const& params, void* smem, int bidb, int tidx) + : Base(params, smem, bidb, tidx) {} + + // Repack. We could use store/load to match the Smem_tile API. + template + inline __device__ void pack(Fragment_a (&dst)[K][M]) { + static_assert(Base::Mma_tile::MMAS_M == M && Base::Mma_tile::MMAS_N == K, ""); + float const scale = reinterpret_cast(this->params_scale_softmax_); +#pragma unroll + for (int mi = 0; mi < M; ++mi) { +#pragma unroll + for (int ki = 0; ki < K; ++ki) { + // 1st row - 4 elements per row. + float tmp_00 = this->elt_[2 * mi + 0][4 * ki + 0] * scale; + float tmp_01 = this->elt_[2 * mi + 0][4 * ki + 1] * scale; + float tmp_02 = this->elt_[2 * mi + 0][4 * ki + 2] * scale; + float tmp_03 = this->elt_[2 * mi + 0][4 * ki + 3] * scale; + + // 2nd row - 4 elements per row. 
+ float tmp_10 = this->elt_[2 * mi + 1][4 * ki + 0] * scale; + float tmp_11 = this->elt_[2 * mi + 1][4 * ki + 1] * scale; + float tmp_12 = this->elt_[2 * mi + 1][4 * ki + 2] * scale; + float tmp_13 = this->elt_[2 * mi + 1][4 * ki + 3] * scale; + + // Pack to 2 registers. + dst[ki][mi].reg(0) = float4_to_char4(tmp_00, tmp_01, tmp_02, tmp_03); + dst[ki][mi].reg(1) = float4_to_char4(tmp_10, tmp_11, tmp_12, tmp_13); + } + } + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Softmax + : public Softmax_hmma { + // The traits. + using Traits = fmha::Ampere_hmma_fp16_traits; + // The base class. + using Base = Softmax_hmma; + // The fragment. + using Fragment_a = fmha::Fragment_a; + // Softmax dst data_type (BMM2 input) + using Dst_type = typename Traits::A_type; + + // Ctor. + template + inline __device__ Softmax(Params const& params, void* smem, int bidb, int tidx) + : Base(params, smem, bidb, tidx) {} + + // Pack the data to a fragment for the next GEMM. + template + inline __device__ void pack(Fragment_a (&dst)[K][M]) const { +#pragma unroll + for (int mi = 0; mi < M; ++mi) { +#pragma unroll + for (int ki = 0; ki < K; ++ki) { + // 1st row - 4 elements per row. + float tmp_00 = this->elt_[2 * mi + 0][4 * ki + 0]; + float tmp_01 = this->elt_[2 * mi + 0][4 * ki + 1]; + float tmp_02 = this->elt_[2 * mi + 0][4 * ki + 2]; + float tmp_03 = this->elt_[2 * mi + 0][4 * ki + 3]; + + // 2nd row - 4 elements per row. + float tmp_10 = this->elt_[2 * mi + 1][4 * ki + 0]; + float tmp_11 = this->elt_[2 * mi + 1][4 * ki + 1]; + float tmp_12 = this->elt_[2 * mi + 1][4 * ki + 2]; + float tmp_13 = this->elt_[2 * mi + 1][4 * ki + 3]; + + // Pack to 4 registers. + dst[ki][mi].reg(0) = fmha::float2_to_16bit_2(tmp_00, tmp_01); + dst[ki][mi].reg(1) = fmha::float2_to_16bit_2(tmp_10, tmp_11); + dst[ki][mi].reg(2) = fmha::float2_to_16bit_2(tmp_02, tmp_03); + dst[ki][mi].reg(3) = fmha::float2_to_16bit_2(tmp_12, tmp_13); + } + } + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Softmax_fp32 : public Softmax_hmma { + // The base class. + using Base = Softmax_hmma; + // The fragment. + using Fragment_a = fmha::Fragment_a; + + // The MMAs. + enum { MMAS_M = Base::MMAS_M }; + + enum { MMAS_N = Base::MMAS_N }; + + // The accumulators. + using Accumulator = fmha::Fragment_accumulator; + // Output accumulators (after conversion). + using Accumulator_out = fmha::Fragment_accumulator; + + // Softmax dst data_type (BMM2 input) + using Dst_type = typename Traits::A_type; + + // DEBUG. + static_assert(Accumulator_out::NUM_REGS == 4, ""); + // END OF DEBUG. + + // DEBUG. + static_assert(std::is_same::value, ""); + + // END OF DEBUG. + + enum { WARPS_M = Cta_tile::WARPS_M }; + + enum { WARPS_N = Cta_tile::WARPS_N }; + + // Use BMM1 softcapping scale or not. + enum { ENABLE_BMM1_SOFTCAPPING_SCALE = Kernel_traits::ENABLE_BMM1_SOFTCAPPING_SCALE }; + + using Smem_tile_red = Smem_tile_reduce; + static_assert(Smem_tile_red::ELTS_PER_TILE == Cta_tile::M * WARPS_N); + + // Ctor. + template + inline __device__ Softmax_fp32(Params const& params, void* smem, int bidb, int tidx) + : Base(params, smem, bidb, tidx), + smem_sum_(static_cast(smem), tidx), + smem_max_(static_cast(smem) + Smem_tile_red::ELTS_PER_TILE, tidx) {} + + // Store the tile after softmax. 
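+  // The FP32 probabilities are converted in pairs to the 16-bit Dst_type (Accumulator_out)
+  // before being handed to the gmem tile.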
+ template + inline __device__ void store(Gmem_tile& gmem_tile) { + Accumulator_out acc[MMAS_M][MMAS_N]; +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { +#pragma unroll + for (int ni = 0; ni < MMAS_N; ++ni) { + // The elements. + float tmp_00 = this->elt_[2 * mi + 0][4 * ni + 0]; + float tmp_01 = this->elt_[2 * mi + 0][4 * ni + 1]; + float tmp_02 = this->elt_[2 * mi + 0][4 * ni + 2]; + float tmp_03 = this->elt_[2 * mi + 0][4 * ni + 3]; + float tmp_10 = this->elt_[2 * mi + 1][4 * ni + 0]; + float tmp_11 = this->elt_[2 * mi + 1][4 * ni + 1]; + float tmp_12 = this->elt_[2 * mi + 1][4 * ni + 2]; + float tmp_13 = this->elt_[2 * mi + 1][4 * ni + 3]; + + // Transform to accumulators. + acc[mi][ni].reg(0) = fmha::float2_to_16bit_2(tmp_00, tmp_01); + acc[mi][ni].reg(1) = fmha::float2_to_16bit_2(tmp_10, tmp_11); + acc[mi][ni].reg(2) = fmha::float2_to_16bit_2(tmp_02, tmp_03); + acc[mi][ni].reg(3) = fmha::float2_to_16bit_2(tmp_12, tmp_13); + } + } + + // Delegate to the gmem tile to store. + gmem_tile.store(acc); + } + + // Pack the data to a fragment for the next GEMM. + template + inline __device__ void pack(Fragment_a (&dst)[K][M]) const { + static_assert(Fragment_a::NUM_REGS == 4, ""); +#pragma unroll + for (int mi = 0; mi < M; ++mi) { +#pragma unroll + for (int ki = 0; ki < K; ++ki) { + // 1st row - 4 elements per row. + float tmp_00 = this->elt_[2 * mi + 0][4 * ki + 0]; + float tmp_01 = this->elt_[2 * mi + 0][4 * ki + 1]; + float tmp_02 = this->elt_[2 * mi + 0][4 * ki + 2]; + float tmp_03 = this->elt_[2 * mi + 0][4 * ki + 3]; + + // 2nd row - 4 elements per row. + float tmp_10 = this->elt_[2 * mi + 1][4 * ki + 0]; + float tmp_11 = this->elt_[2 * mi + 1][4 * ki + 1]; + float tmp_12 = this->elt_[2 * mi + 1][4 * ki + 2]; + float tmp_13 = this->elt_[2 * mi + 1][4 * ki + 3]; + + // Pack to 4 registers. + dst[ki][mi].reg(0) = fmha::float2_to_16bit_2(tmp_00, tmp_01); + dst[ki][mi].reg(1) = fmha::float2_to_16bit_2(tmp_10, tmp_11); + dst[ki][mi].reg(2) = fmha::float2_to_16bit_2(tmp_02, tmp_03); + dst[ki][mi].reg(3) = fmha::float2_to_16bit_2(tmp_12, tmp_13); + } + } + } + + // Pack the data to a uint4 for the next operation. + template + inline __device__ void pack(uint4 (&dst)[M][N]) const { +#pragma unroll + for (int mi = 0; mi < M; ++mi) { +#pragma unroll + for (int ni = 0; ni < N; ++ni) { + // 1st row - 4 elements per row. + float tmp_00 = this->elt_[2 * mi + 0][4 * ni + 0]; + float tmp_01 = this->elt_[2 * mi + 0][4 * ni + 1]; + float tmp_02 = this->elt_[2 * mi + 0][4 * ni + 2]; + float tmp_03 = this->elt_[2 * mi + 0][4 * ni + 3]; + + // 2nd row - 4 elements per row. + float tmp_10 = this->elt_[2 * mi + 1][4 * ni + 0]; + float tmp_11 = this->elt_[2 * mi + 1][4 * ni + 1]; + float tmp_12 = this->elt_[2 * mi + 1][4 * ni + 2]; + float tmp_13 = this->elt_[2 * mi + 1][4 * ni + 3]; + + // Pack to 4 registers. + dst[mi][ni].x = fmha::float2_to_16bit_2(tmp_00, tmp_01); + dst[mi][ni].y = fmha::float2_to_16bit_2(tmp_02, tmp_03); + dst[mi][ni].z = fmha::float2_to_16bit_2(tmp_10, tmp_11); + dst[mi][ni].w = fmha::float2_to_16bit_2(tmp_12, tmp_13); + } + } + } + + // Scale FP32 fragments + inline __device__ void unpack(Accumulator const (&acc)[MMAS_M][MMAS_N]) { + float const scalef = reinterpret_cast(this->params_scale_bmm1_); + +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { +#pragma unroll + for (int ni = 0; ni < MMAS_N; ++ni) { + // 1st row - 4 elements per row. 
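+        // (accumulator registers 0,1,4,5; the 2nd row below uses 2,3,6,7). unpack()
+        // applies the BMM1 scale while remapping into contiguous per-row elt_ storage.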
+ this->elt_[2 * mi + 0][4 * ni + 0] = acc[mi][ni].elt(0) * scalef; + this->elt_[2 * mi + 0][4 * ni + 1] = acc[mi][ni].elt(1) * scalef; + this->elt_[2 * mi + 0][4 * ni + 2] = acc[mi][ni].elt(4) * scalef; + this->elt_[2 * mi + 0][4 * ni + 3] = acc[mi][ni].elt(5) * scalef; + // 2nd row - 4 elements per row. + this->elt_[2 * mi + 1][4 * ni + 0] = acc[mi][ni].elt(2) * scalef; + this->elt_[2 * mi + 1][4 * ni + 1] = acc[mi][ni].elt(3) * scalef; + this->elt_[2 * mi + 1][4 * ni + 2] = acc[mi][ni].elt(6) * scalef; + this->elt_[2 * mi + 1][4 * ni + 3] = acc[mi][ni].elt(7) * scalef; + + // Attention logit softcapping scale. + // 1.0f / softcapping_scale has been fused to scale_bmm1. + if constexpr (ENABLE_BMM1_SOFTCAPPING_SCALE) { + this->elt_[2 * mi + 0][4 * ni + 0] = + this->params_softcapping_scale_bmm1_ * __tanhf(this->elt_[2 * mi + 0][4 * ni + 0]); + this->elt_[2 * mi + 0][4 * ni + 1] = + this->params_softcapping_scale_bmm1_ * __tanhf(this->elt_[2 * mi + 0][4 * ni + 1]); + this->elt_[2 * mi + 1][4 * ni + 0] = + this->params_softcapping_scale_bmm1_ * __tanhf(this->elt_[2 * mi + 1][4 * ni + 0]); + this->elt_[2 * mi + 1][4 * ni + 1] = + this->params_softcapping_scale_bmm1_ * __tanhf(this->elt_[2 * mi + 1][4 * ni + 1]); + this->elt_[2 * mi + 0][4 * ni + 2] = + this->params_softcapping_scale_bmm1_ * __tanhf(this->elt_[2 * mi + 0][4 * ni + 2]); + this->elt_[2 * mi + 0][4 * ni + 3] = + this->params_softcapping_scale_bmm1_ * __tanhf(this->elt_[2 * mi + 0][4 * ni + 3]); + this->elt_[2 * mi + 1][4 * ni + 2] = + this->params_softcapping_scale_bmm1_ * __tanhf(this->elt_[2 * mi + 1][4 * ni + 2]); + this->elt_[2 * mi + 1][4 * ni + 3] = + this->params_softcapping_scale_bmm1_ * __tanhf(this->elt_[2 * mi + 1][4 * ni + 3]); + } + } + } + } + + // Scale FP32 fragments + inline __device__ void unpack_noscale(Accumulator const (&acc)[MMAS_M][MMAS_N]) { +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { +#pragma unroll + for (int ni = 0; ni < MMAS_N; ++ni) { + // 1st row - 4 elements per row. + this->elt_[2 * mi + 0][4 * ni + 0] = acc[mi][ni].elt(0); + this->elt_[2 * mi + 0][4 * ni + 1] = acc[mi][ni].elt(1); + this->elt_[2 * mi + 0][4 * ni + 2] = acc[mi][ni].elt(4); + this->elt_[2 * mi + 0][4 * ni + 3] = acc[mi][ni].elt(5); + // 2nd row - 4 elements per row. 
+ this->elt_[2 * mi + 1][4 * ni + 0] = acc[mi][ni].elt(2); + this->elt_[2 * mi + 1][4 * ni + 1] = acc[mi][ni].elt(3); + this->elt_[2 * mi + 1][4 * ni + 2] = acc[mi][ni].elt(6); + this->elt_[2 * mi + 1][4 * ni + 3] = acc[mi][ni].elt(7); + } + } + } + + template + __device__ inline void reduce_(float (&frag)[2 * MMAS_M], Operator& op, Smem_tile_red& smem_red) { +#pragma unroll + for (int mi = 0; mi < 2 * MMAS_M; mi++) { + frag[mi] = this->elt_[mi][0]; +#pragma unroll + for (int ni = 1; ni < 4 * MMAS_N; ni++) { + frag[mi] = op(frag[mi], this->elt_[mi][ni]); + } + } + quad_reduce(frag, frag, op); + + if (WARPS_N > 1) { + smem_red.store(frag); + __syncthreads(); + typename Smem_tile_red::read_t tmp[2 * MMAS_M]; + smem_red.load(tmp); + + quad_allreduce(frag, tmp, op); + } + } + + __device__ inline void reduce_max(float (&frag)[2 * MMAS_M]) { + MaxOp max; + reduce_(frag, max, smem_max_); + } + + __device__ inline void reduce_sum(float (&frag)[2 * MMAS_M]) { + SumOp sum; + reduce_(frag, sum, smem_sum_); + } + + __device__ inline float correct(float warp_sum, float warp_max, float max) { + return warp_sum * __expf(warp_max - max); + } + + __device__ inline float2 correct(float2 warp_sum, float2 warp_max, float max) { + return {correct(warp_sum.x, warp_max.x, max), correct(warp_sum.y, warp_max.y, max)}; + } + + __device__ inline void online_softmax() { + MaxOp maxOp; + SumOp sumOp; + float max[2 * MMAS_M]; +#pragma unroll + for (int mi = 0; mi < 2 * MMAS_M; mi++) { + max[mi] = this->elt_[mi][0]; +#pragma unroll + for (int ni = 1; ni < 4 * MMAS_N; ni++) { + max[mi] = maxOp(max[mi], this->elt_[mi][ni]); + } + } + quad_allreduce(max, max, maxOp); + smem_max_.store(max); + float sum[2 * MMAS_M]; +#pragma unroll + for (int mi = 0; mi < 2 * MMAS_M; mi++) { + sum[mi] = 0.f; +#pragma unroll + for (int ni = 0; ni < 4 * MMAS_N; ni++) { + float x = this->elt_[mi][ni]; + this->elt_[mi][ni] = __expf(x - max[mi]); + sum[mi] += this->elt_[mi][ni]; + } + } + quad_allreduce(sum, sum, sumOp); + smem_sum_.store(sum); + + __syncthreads(); + + typename Smem_tile_red::read_t tmp_max[2 * MMAS_M]; + typename Smem_tile_red::read_t tmp_sum[2 * MMAS_M]; + smem_max_.load(tmp_max); + smem_sum_.load(tmp_sum); + float full_max[2 * MMAS_M]; + quad_allreduce(full_max, tmp_max, maxOp); +#pragma unroll + for (int mi = 0; mi < 2 * MMAS_M; mi++) { + tmp_sum[mi] = correct(tmp_sum[mi], tmp_max[mi], full_max[mi]); + } + quad_allreduce(sum, tmp_sum, sumOp); +#pragma unroll + for (int mi = 0; mi < 2 * MMAS_M; mi++) { + float correction = __expf(max[mi] - full_max[mi]) / sum[mi]; +#pragma unroll + for (int ni = 0; ni < 4 * MMAS_N; ni++) { + this->elt_[mi][ni] *= correction; + } + } + } + + Smem_tile_red smem_max_; + Smem_tile_red smem_sum_; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Softmax + : public Softmax_fp32 { + // The traits. + using Traits = fmha::Ampere_hmma_fp32_traits; + // The base class. + using Base = Softmax_fp32; + + // Ctor. + template + inline __device__ Softmax(Params const& params, void* smem, int bidb, int tidx) + : Base(params, smem, bidb, tidx) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Softmax + : public Softmax_fp32 { + // The traits. + using Traits = fmha::Turing_hmma_fp32_traits; + // The base class. + using Base = Softmax_fp32; + // The fragment. 
+ using Fragment_a = fmha::Fragment_a; + // Softmax dst data_type (BMM2 input) + using Dst_type = typename Traits::A_type; + + // Ctor. + template + inline __device__ Softmax(Params const& params, void* smem, int bidb, int tidx) + : Base(params, smem, bidb, tidx) {} + + // Pack the data to a fragment for the next GEMM. + template + inline __device__ void pack(Fragment_a (&dst)[K][M]) const { + static_assert(Fragment_a::NUM_REGS == 2, ""); + static_assert(Base::Mma_tile::MMAS_M == M && Base::Mma_tile::MMAS_N * 4 == K * 2, ""); +#pragma unroll + for (int mi = 0; mi < M; ++mi) { +#pragma unroll + for (int ki = 0; ki < K; ++ki) { + // 1st row - 2 elements per row. + float tmp_00 = this->elt_[2 * mi + 0][2 * ki + 0]; + float tmp_01 = this->elt_[2 * mi + 0][2 * ki + 1]; + + // 2nd row - 2 elements per row. + float tmp_10 = this->elt_[2 * mi + 1][2 * ki + 0]; + float tmp_11 = this->elt_[2 * mi + 1][2 * ki + 1]; + + // Pack to 2 registers. + dst[ki][mi].reg(0) = fmha::float2_to_16bit_2(tmp_00, tmp_01); + dst[ki][mi].reg(1) = fmha::float2_to_16bit_2(tmp_10, tmp_11); + } + } + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Softmax + : public Softmax_fp32 { + // The traits. + using Traits = fmha::Ampere_hmma_bf16_traits; + // The base class. + using Base = Softmax_fp32; + + // Ctor. + template + inline __device__ Softmax(Params const& params, void* smem, int bidb, int tidx) + : Base(params, smem, bidb, tidx) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Softmax + : public Softmax_imma { + // The traits. + using Traits = fmha::Ampere_imma_int8_int32_traits; + // The base class. + using Base = Softmax_imma; + + // Ctor. + template + inline __device__ Softmax(Params const& params, void* smem, int bidb, int tidx) + : Base(params, smem, bidb, tidx) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Softmax + : public Softmax_qmma { + // The traits. + using Traits = fmha::Ada_qmma_e4m3_fp32_traits; + // The base class. + using Base = Softmax_qmma; + + // Ctor. + template + inline __device__ Softmax(Params const& params, void* smem, int bidb, int tidx) + : Base(params, smem, bidb, tidx) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Softmax + : public Softmax_qmma { + // The traits. + using Traits = fmha::Ada_qmma_e4m3_fp16_traits; + // The base class. + using Base = Softmax_qmma; + + // Ctor. + template + inline __device__ Softmax(Params const& params, void* smem, int bidb, int tidx) + : Base(params, smem, bidb, tidx) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Softmax + : public Softmax_imma { + // The Traits + using Traits = fmha::Ada_qmma_e4m3_fp32_traits; + // The base class. + using Base = Softmax_imma; + + // The MMAs. + enum { MMAS_M = Base::MMAS_M }; + + enum { MMAS_N = Base::MMAS_N }; + + // The accumulators. + using Accumulator = fmha::Fragment_accumulator; + + // Ctor. + template + inline __device__ Softmax(Params const& params, void* smem, int bidb, int tidx) + : Base(params, smem, bidb, tidx), + params_scale_bmm1_(params.scale_bmm1_d ? *params.scale_bmm1_d : params.scale_bmm1), + params_scale_softmax_(params.scale_softmax) {} + + // Store the tile after softmax. 
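+  // Note: this variant consumes per-block Q/K dequantization scales (see
+  // move_to_first_block / move_to_next_block below): unpack() folds scale_q * scale_k into
+  // the logits, and pack() re-applies scale_softmax when quantizing back to FP8.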
+ template + inline __device__ void store(Gmem_tile& gmem_tile) { + float const scale = reinterpret_cast(params_scale_softmax_); + Accumulator acc[MMAS_M][MMAS_N]; +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { +#pragma unroll + for (int ni = 0; ni < MMAS_N; ++ni) { + // scale + acc[mi][ni].ele(0) = this->elt_[2 * mi + 0][4 * ni + 0] * scale; + acc[mi][ni].ele(1) = this->elt_[2 * mi + 0][4 * ni + 1] * scale; + acc[mi][ni].ele(4) = this->elt_[2 * mi + 0][4 * ni + 2] * scale; + acc[mi][ni].ele(5) = this->elt_[2 * mi + 0][4 * ni + 3] * scale; + acc[mi][ni].ele(2) = this->elt_[2 * mi + 1][4 * ni + 0] * scale; + acc[mi][ni].ele(3) = this->elt_[2 * mi + 1][4 * ni + 1] * scale; + acc[mi][ni].ele(6) = this->elt_[2 * mi + 1][4 * ni + 2] * scale; + acc[mi][ni].ele(7) = this->elt_[2 * mi + 1][4 * ni + 3] * scale; + } + } + + // Delegate to the gmem tile to store. + // TODO: need fp32 to fp8 conversion (move this to gmem_tile) + gmem_tile.store(acc); + } + + // Convert from accumulators to floats. + inline __device__ void unpack(Accumulator const (&acc)[MMAS_M][MMAS_N]) { + float const scale = params_scale_q_ * params_scale_k_; +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { +#pragma unroll + for (int ni = 0; ni < MMAS_N; ++ni) { + // Convert to FP32 and scale. + this->elt_[2 * mi + 0][4 * ni + 0] = acc[mi][ni].elt(0) * scale; + this->elt_[2 * mi + 0][4 * ni + 1] = acc[mi][ni].elt(1) * scale; + this->elt_[2 * mi + 0][4 * ni + 2] = acc[mi][ni].elt(4) * scale; + this->elt_[2 * mi + 0][4 * ni + 3] = acc[mi][ni].elt(5) * scale; + this->elt_[2 * mi + 1][4 * ni + 0] = acc[mi][ni].elt(2) * scale; + this->elt_[2 * mi + 1][4 * ni + 1] = acc[mi][ni].elt(3) * scale; + this->elt_[2 * mi + 1][4 * ni + 2] = acc[mi][ni].elt(6) * scale; + this->elt_[2 * mi + 1][4 * ni + 3] = acc[mi][ni].elt(7) * scale; + } + } + } + + template + inline __device__ void apply_exp_with_mask(float const (&max)[MMAS_M * 2]) { +#pragma unroll + for (int mi = 0; mi < MMAS_M * 2; ++mi) { + float max_val = APPLY_MASK && max[mi] == -FLT_MAX + ? 0.f + : (max[mi] - logf(Traits::SOFTMAX_FP_QUANT_SCALE)); +#pragma unroll + for (int ni = 0; ni < MMAS_N * 4; ++ni) { + this->elt_[mi][ni] = expf(this->elt_[mi][ni] - max_val); + } + } + } + + // Pack the data to a fragment for the next GEMM. + template + inline __device__ void pack(Fragment_a (&dst)[K][M]) const { + float const scale = reinterpret_cast(this->params_scale_softmax_); + +// The canonical layout in K should be R0: [0,1,2,3] R2: [16,17,18,19] +// Note below that this is not possible with the register layout of the accumulator. +#pragma unroll + for (int mi = 0; mi < M; ++mi) { +#pragma unroll + for (int ki = 0; ki < K; ++ki) { + // 1st row - 8 elements per row. + float tmp_00 = this->elt_[2 * mi + 0][8 * ki + 0] * scale; // + 0 + float tmp_01 = this->elt_[2 * mi + 0][8 * ki + 1] * scale; // + 1 + float tmp_02 = this->elt_[2 * mi + 0][8 * ki + 2] * scale; // + 8 + float tmp_03 = this->elt_[2 * mi + 0][8 * ki + 3] * scale; // + 9 + float tmp_04 = this->elt_[2 * mi + 0][8 * ki + 4] * scale; // +16 + float tmp_05 = this->elt_[2 * mi + 0][8 * ki + 5] * scale; // +17 + float tmp_06 = this->elt_[2 * mi + 0][8 * ki + 6] * scale; // +24 + float tmp_07 = this->elt_[2 * mi + 0][8 * ki + 7] * scale; // +25 + + // 2nd row - 4 elements per row. 
+ float tmp_10 = this->elt_[2 * mi + 1][8 * ki + 0] * scale; // + 0 + float tmp_11 = this->elt_[2 * mi + 1][8 * ki + 1] * scale; // + 1 + float tmp_12 = this->elt_[2 * mi + 1][8 * ki + 2] * scale; // + 8 + float tmp_13 = this->elt_[2 * mi + 1][8 * ki + 3] * scale; // + 9 + float tmp_14 = this->elt_[2 * mi + 1][8 * ki + 4] * scale; // +16 + float tmp_15 = this->elt_[2 * mi + 1][8 * ki + 5] * scale; // +17 + float tmp_16 = this->elt_[2 * mi + 1][8 * ki + 6] * scale; // +24 + float tmp_17 = this->elt_[2 * mi + 1][8 * ki + 7] * scale; // +25 + + // Pack to 4 registers. + dst[ki][mi].reg(0) = fmha::float4_to_fp8x4(tmp_00, tmp_01, tmp_02, tmp_03); + dst[ki][mi].reg(1) = fmha::float4_to_fp8x4(tmp_10, tmp_11, tmp_12, tmp_13); + dst[ki][mi].reg(2) = fmha::float4_to_fp8x4(tmp_04, tmp_05, tmp_06, tmp_07); + dst[ki][mi].reg(3) = fmha::float4_to_fp8x4(tmp_14, tmp_15, tmp_16, tmp_17); + } + } + } + + template + inline __device__ void move_to_first_block(Params const& params, int bidb, int bidh, int q_loop) { + int scale_q_iter = + bidb * params.h * params.sage.q.max_nblock + bidh * params.sage.q.max_nblock + q_loop; + params_scale_q_ = __ldg(params.sage.q.scales + scale_q_iter); + params_scale_q_ *= reinterpret_cast(params_scale_bmm1_); + + int scale_k_iter = bidb * params.h * params.sage.k.max_nblock + bidh * params.sage.k.max_nblock; + params_scale_k_iter = reinterpret_cast(params.sage.k.scales + scale_k_iter); + params_scale_k_ = __ldg(params_scale_k_iter); + } + + inline __device__ void move_to_next_block() { + params_scale_k_iter += 1; + params_scale_k_ = __ldg(params_scale_k_iter); + } + + // The scaling factors. + uint32_t const params_scale_bmm1_, params_scale_softmax_; + float params_scale_q_, params_scale_k_; + float const* params_scale_k_iter; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// HOPPER SOFTMAX + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Softmax_gmma_base {}; + +template +struct Softmax_gmma_base { + // The instruction traits. + using Traits = Traits_; + // The Cta_tile. + using Cta_tile = Cta_tile_; + // The Kernel traits. + using Kernel_traits = Kernel_traits_; + // The accumulators. + using Accumulator = fmha::Fragment_accumulator; + // The Mma tile. + using Mma_tile = typename Traits::template Mma_tile; + + static_assert(Cta_tile::WARPS_M == 4); + static_assert(Mma_tile::M_PER_MMA_PER_CTA == 64); + + // The number of MMAs in M/N dimensions. + enum { MMAS_M = Mma_tile::MMAS_M }; + + enum { MMAS_N = Mma_tile::MMAS_N }; + + // Elements per thread per core matrix. + enum { ELTS_PER_THREAD = 2 }; + + // Core matrix is always 8x4. + enum { THREADS_PER_ROW = 4 }; + + enum { SMEM_BYTES = 0 }; + + // The number of rows accessed by each thread. + enum { + ROWS_PER_THREAD = + Traits::GMMA_M / (Cta_tile::THREADS_PER_WARP / THREADS_PER_ROW) / Cta_tile::WARPS_M + }; + + static_assert(ROWS_PER_THREAD == Mma_tile::ROWS_PER_THREAD); + + // The number of columns access by each thread. + // Note there are 2 elements per reg. + enum { COLS_PER_THREAD = Traits::GMMA_N / THREADS_PER_ROW / ELTS_PER_THREAD }; + + // The number of total elements per thread. 
+ enum { TOTAL_ELTS_PER_THREAD = ELTS_PER_THREAD * COLS_PER_THREAD }; + + template + inline __device__ Softmax_gmma_base(Params const& params, void*, int const, int const) + : params_scale_bmm1_(params.scale_bmm1), + params_softcapping_scale_bmm1_(params.softcapping_scale_bmm1) {} + + // Apply mask before softmax. Use 1 byte per MMA distributed as 2x4. + template + inline __device__ void apply_mask(Mask const& mask) { +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { +#pragma unroll + for (int ii = 0; ii < ROWS_PER_THREAD; ++ii) { +#pragma unroll + for (int ni = 0; ni < MMAS_N; ++ni) { +#pragma unroll + for (int jj = 0; jj < TOTAL_ELTS_PER_THREAD; ++jj) { + if (!mask.is_valid(mi, ni, ii, jj)) { + this->elt_[ROWS_PER_THREAD * mi + ii][TOTAL_ELTS_PER_THREAD * ni + jj] = -FLT_MAX; + } + } // jj + } // ni + } // ii + } // mi + } + + template + inline __device__ void apply_mask_alibi(Mask const& mask, int head_id, + AlibiParams const& alibi_params) { + // 'if constexpr' because ALiBi is only defined for causal masks + if constexpr (Kernel_traits::CAUSAL_MASK) { + float m = get_alibi_head_scaling_factor(head_id, alibi_params); +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { +#pragma unroll + for (int ii = 0; ii < ROWS_PER_THREAD; ++ii) { +#pragma unroll + for (int ni = 0; ni < MMAS_N; ++ni) { +#pragma unroll + for (int jj = 0; jj < TOTAL_ELTS_PER_THREAD; ++jj) { + int row, col; + mask.get_row_col(row, col, mi, ni, ii, jj); + if (mask.is_valid(row, col)) { + // Since softmax is shift invariant, + // it is sufficient just to use the column as the multiplier + elt_[ROWS_PER_THREAD * mi + ii][TOTAL_ELTS_PER_THREAD * ni + jj] = + elt_[ROWS_PER_THREAD * mi + ii][TOTAL_ELTS_PER_THREAD * ni + jj] * + alibi_params.scale_after_alibi + + m * (col + alibi_params.sequence_pos_offset); + } else { + elt_[ROWS_PER_THREAD * mi + ii][TOTAL_ELTS_PER_THREAD * ni + jj] = -FLT_MAX; + } + } + } + } + } + } else { + __builtin_unreachable(); + } + } + + // Do a CTA-wide reduction. + template + inline __device__ void reduce_4x1(float (&dst)[MMAS_M * ROWS_PER_THREAD]) { +#if defined(USE_SAME_SUM_ORDER_IN_SOFTMAX_AS_REF_CODE) + static_assert(MMAS_N * COLS_PER_THREAD * ELTS_PER_THREAD == MMAS_N * Mma_tile::CORES_N * 2); + if (Functor::IS_SUM) { +// Apply the summation inside the thread. +#pragma unroll + for (int mi = 0; mi < MMAS_M * ROWS_PER_THREAD; ++mi) { + dst[mi] = (this->elt_[mi][0] + this->elt_[mi][1]); +#pragma unroll + for (int ni = 1; ni < MMAS_N * Mma_tile::CORES_N; ni++) { + dst[mi] += (this->elt_[mi][ni * 2 + 0] + this->elt_[mi][ni * 2 + 1]); + } + } + } else +#endif // defined(USE_SAME_SUM_ORDER_IN_SOFTMAX_AS_REF_CODE) + { +// find the max/sum for each row. +// For hopper, each row is held entirely within 4 threads. +// Apply the functor for each row inside a thread. +#pragma unroll + for (int mi = 0; mi < MMAS_M * ROWS_PER_THREAD; ++mi) { + dst[mi] = this->elt_[mi][0]; +#pragma unroll + for (int ni = 1; ni < MMAS_N * COLS_PER_THREAD * ELTS_PER_THREAD; ++ni) { + dst[mi] = Functor::apply(dst[mi], this->elt_[mi][ni]); + } + } + } +// Apply the functor for each row inside each group of 4 threads. +#pragma unroll + for (int mi = 0; mi < MMAS_M * ROWS_PER_THREAD; ++mi) { + dst[mi] = Functor::apply(dst[mi], __shfl_xor_sync(uint32_t(-1), dst[mi], 1)); + __syncwarp(); + dst[mi] = Functor::apply(dst[mi], __shfl_xor_sync(uint32_t(-1), dst[mi], 2)); + __syncwarp(); + } + } + + // Do a CTA-wide reduction. 
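+  // With the 4x1 warp layout each row is owned by a single quad, so the two XOR shuffles
+  // in reduce_4x1() already complete the row reduction and no shared-memory round trip
+  // is needed.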
+ template + inline __device__ void reduce(float (&dst)[MMAS_M * ROWS_PER_THREAD]) { + reduce_4x1(dst); + } + + // Apply the exp to all the elements. + inline __device__ void apply_exp(float const (&max)[MMAS_M * ROWS_PER_THREAD]) { +#pragma unroll + for (int mi = 0; mi < MMAS_M * ROWS_PER_THREAD; ++mi) { +#pragma unroll + for (int ni = 0; ni < MMAS_N * COLS_PER_THREAD * ELTS_PER_THREAD; ++ni) { + this->elt_[mi][ni] = apply_exp_(this->elt_[mi][ni], max[mi]); + } + } + } + + // Scale all the elements. + inline __device__ void scale(float const (&sum)[MMAS_M * ROWS_PER_THREAD]) { + // Precompute the inverse sum to normalize. Without -use_fast_math, it makes a huge deal. + float inv_sum[MMAS_M * ROWS_PER_THREAD]; +#pragma unroll + for (int mi = 0; mi < MMAS_M * ROWS_PER_THREAD; ++mi) { + inv_sum[mi] = (sum[mi] == 0.f || sum[mi] != sum[mi]) ? 1.f : 1.f / sum[mi]; + } + +// Update the values. +#pragma unroll + for (int mi = 0; mi < MMAS_M * ROWS_PER_THREAD; ++mi) { +#pragma unroll + for (int ni = 0; ni < MMAS_N * COLS_PER_THREAD * ELTS_PER_THREAD; ++ni) { + this->elt_[mi][ni] *= inv_sum[mi]; + } + } + } + + // The scalig factor. Depens on acc type, e.g. float for 32-bit and fp16x2/bf16x2 for 16-bit. + uint32_t const params_scale_bmm1_; + float const params_softcapping_scale_bmm1_; + // The elements. + float elt_[MMAS_M * ROWS_PER_THREAD][MMAS_N * COLS_PER_THREAD * ELTS_PER_THREAD]; +}; + +template +struct Softmax_gmma_base + : public Softmax_gmma_base { + using Base = Softmax_gmma_base; + + using Mma_tile = typename Base::Mma_tile; + + enum { BYTES_PER_SMEM = Mma_tile::M_PER_MMA_PER_CTA * Cta_tile::WARPS_N * sizeof(float) }; + + enum { ELTS_PER_ROW = 2 }; + + static_assert(Cta_tile::WARPS_N == 2); + static_assert(Cta_tile::WARPS_M == 4); + static_assert(Mma_tile::M_PER_MMA_PER_CTA == 64); + + template + inline __device__ Softmax_gmma_base(Params const& params, void* smem, int const bidb, + int const tidx) + : Base(params, smem, bidb, tidx) { + int const warp = tidx / Cta_tile::THREADS_PER_WARP; + int const warp_n = warp / 4; + int const warp_m = warp % 4; + int const lane = tidx % Cta_tile::THREADS_PER_WARP; + int const quad = lane / 4; + is_writer_ = lane % 4 == 0; + + int const col = warp_n; + int const row = warp_m * 16 + quad; + + smem_write_ = static_cast(smem) + row * 2 + col; + smem_read_ = static_cast(smem) + row; + } + + // Do a CTA-wide reduction. + template + inline __device__ void reduce(float (&dst)[2]) { + Base::template reduce_4x1(dst); + if (is_writer_) { + smem_write_[0 * ELTS_PER_ROW] = dst[0]; + smem_write_[8 * ELTS_PER_ROW] = dst[1]; + } + __syncthreads(); + float2 tmp0 = smem_read_[0]; + float2 tmp1 = smem_read_[8]; + dst[0] = Functor::apply(tmp0.x, tmp0.y); + dst[1] = Functor::apply(tmp1.x, tmp1.y); + } + + float* smem_write_; + float2* smem_read_; + bool is_writer_; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Softmax, + Cta_tile_, Kernel_traits_> + : public Softmax_gmma_base< + fmha::Hopper_hgmma_fp16_traits, Cta_tile_, + Kernel_traits_, Cta_tile_::WARPS_N> { + // The traits. + using Traits = fmha::Hopper_hgmma_fp16_traits; + // Cta_tile. + using Cta_tile = Cta_tile_; + // Kernel_traits. + using Kernel_traits = Kernel_traits_; + // The Base class. + using Base = Softmax_gmma_base; + // The accumulators. + using Accumulator = typename Base::Accumulator; + // The Mma tile. + using Mma_tile = typename Base::Mma_tile; + + // The number of MMAs in M/N dimensions. 
+ enum { MMAS_M = Mma_tile::MMAS_M }; + + enum { MMAS_N = Mma_tile::MMAS_N }; + + // for HGMMA_FP16, there are 2 elements per RF for ACC. + enum { ELTS_PER_THREAD = 2 }; + + // for Hopper HGMMA, each row is held within 4 threads. + enum { THREADS_PER_ROW = 4 }; + + // The number of rows accessed by each thread. + enum { + ROWS_PER_THREAD = + Traits::GMMA_M / (Cta_tile::THREADS_PER_WARP / THREADS_PER_ROW) / Cta_tile::WARPS_M + }; + + // The number of columns access by each thread. + // Note there are 2 elements per reg. + enum { COLS_PER_THREAD = Traits::GMMA_N / THREADS_PER_ROW / ELTS_PER_THREAD }; + + // Use BMM1 softcapping scale or not. + enum { ENABLE_BMM1_SOFTCAPPING_SCALE = Kernel_traits::ENABLE_BMM1_SOFTCAPPING_SCALE }; + + // Ctor. + template + inline __device__ Softmax(Params const& params, void* smem, int bidb, int tidx) + : Base(params, smem, bidb, tidx) {} + + // Convert from FP16 fragments to floats. + inline __device__ void unpack(Accumulator const (&acc)[MMAS_M][MMAS_N]) { +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { +#pragma unroll + for (int ni = 0; ni < MMAS_N; ++ni) { +#pragma unroll + for (int col_idx = 0; col_idx < COLS_PER_THREAD; ++col_idx) { +#pragma unroll + for (int row_idx = 0; row_idx < ROWS_PER_THREAD; ++row_idx) { + // the order of the acc rf is we traverse vertically first + // then we traverse horizontally. + + // Normalize the values. + uint32_t acc_0 = fmha::hmul2(acc[mi][ni].reg(col_idx * ROWS_PER_THREAD + row_idx), + this->params_scale_bmm1_); + // Element index. + int elt_row_idx = ROWS_PER_THREAD * mi + row_idx; + int elt_col_idx = COLS_PER_THREAD * ELTS_PER_THREAD * ni + col_idx * ELTS_PER_THREAD; + // Extract the values as floats. + half2_to_float2(this->elt_[elt_row_idx][elt_col_idx + 0], + this->elt_[elt_row_idx][elt_col_idx + 1], acc_0); + // Attention logit softcapping scale. + // 1.0f / softcapping_scale has been fused to scale_bmm1. + if constexpr (ENABLE_BMM1_SOFTCAPPING_SCALE) { + this->elt_[elt_row_idx][elt_col_idx + 0] = + this->params_softcapping_scale_bmm1_ * + __tanhf(this->elt_[elt_row_idx][elt_col_idx + 0]); + this->elt_[elt_row_idx][elt_col_idx + 1] = + this->params_softcapping_scale_bmm1_ * + __tanhf(this->elt_[elt_row_idx][elt_col_idx + 1]); + } + } // row_idx + } // col_idx + } // ni + } // mi + } + + // Store the tile after softmax. + template + inline __device__ void store(Gmem_tile& gmem_tile) { + Accumulator acc[MMAS_M][MMAS_N]; + +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { +#pragma unroll + for (int ni = 0; ni < MMAS_N; ++ni) { +#pragma unroll + for (int col_idx = 0; col_idx < COLS_PER_THREAD; ++col_idx) { +#pragma unroll + for (int row_idx = 0; row_idx < ROWS_PER_THREAD; ++row_idx) { + // the order of the acc rf is we traverse vertically first + // then we traverse horizontally. + float tmp_00 = + this->elt_[ROWS_PER_THREAD * mi + row_idx] + [COLS_PER_THREAD * ELTS_PER_THREAD * ni + col_idx * ELTS_PER_THREAD + 0]; + float tmp_01 = + this->elt_[ROWS_PER_THREAD * mi + row_idx] + [COLS_PER_THREAD * ELTS_PER_THREAD * ni + col_idx * ELTS_PER_THREAD + 1]; + acc[mi][ni].reg(col_idx * ROWS_PER_THREAD + row_idx) = + fmha::float2_to_half2(tmp_00, tmp_01); + } // row_idx + } // col_idx + } // ni + } // m + + // Delegate to the gmem tile to store. + gmem_tile.store(acc); + } + + // Pack the data to a fragment for the next GEMM. + template + inline __device__ void pack(Fragment_a (&dst)[K][M]) const { +// we know the instruction shape is 64xNx16 +// Thus for input A matrix, it is of size 64x16 per warpgroup. 
+// Thus, each threads access 2 rows and 4 columns. contiguous 2 columns are held by 1 RF. +#pragma unroll + for (int mi = 0; mi < M; ++mi) { +#pragma unroll + for (int ki = 0; ki < K; ++ki) { + // 1st row - 4 elements per row. + float tmp_00 = this->elt_[2 * mi + 0][4 * ki + 0]; + float tmp_01 = this->elt_[2 * mi + 0][4 * ki + 1]; + float tmp_02 = this->elt_[2 * mi + 0][4 * ki + 2]; + float tmp_03 = this->elt_[2 * mi + 0][4 * ki + 3]; + + // 2nd row - 4 elements per row. + float tmp_10 = this->elt_[2 * mi + 1][4 * ki + 0]; + float tmp_11 = this->elt_[2 * mi + 1][4 * ki + 1]; + float tmp_12 = this->elt_[2 * mi + 1][4 * ki + 2]; + float tmp_13 = this->elt_[2 * mi + 1][4 * ki + 3]; + + // Pack to 4 registers. + dst[ki][mi].reg(0) = fmha::float2_to_half2(tmp_00, tmp_01); + dst[ki][mi].reg(1) = fmha::float2_to_half2(tmp_10, tmp_11); + dst[ki][mi].reg(2) = fmha::float2_to_half2(tmp_02, tmp_03); + dst[ki][mi].reg(3) = fmha::float2_to_half2(tmp_12, tmp_13); + } + } + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Softmax, + Cta_tile_, Kernel_traits_> + : public Softmax_gmma_base< + fmha::Hopper_hgmma_fp32_traits, Cta_tile_, + Kernel_traits_, Cta_tile_::WARPS_N> { + // The traits. + using Traits = fmha::Hopper_hgmma_fp32_traits; + // Cta_tile. + using Cta_tile = Cta_tile_; + // Kernel_traits. + using Kernel_traits = Kernel_traits_; + // The Base class. + using Base = Softmax_gmma_base; + // The accumulators. + using Accumulator = typename Base::Accumulator; + // The Mma tile. + using Mma_tile = typename Base::Mma_tile; + + // The number of MMAs in M/N dimensions. + enum { MMAS_M = Mma_tile::MMAS_M }; + + enum { MMAS_N = Mma_tile::MMAS_N }; + + // for HGMMA_FP16, there are 2 elements per RF for ACC. + enum { ELTS_PER_THREAD = 2 }; + + // for Hopper HGMMA, each row is held within 4 threads. + enum { THREADS_PER_ROW = 4 }; + + // The number of rows accessed by each thread. + enum { + ROWS_PER_THREAD = + Traits::GMMA_M / (Cta_tile::THREADS_PER_WARP / THREADS_PER_ROW) / Cta_tile::WARPS_M + }; + + // The number of columns access by each thread. + // Note there are 2 elements per reg. + enum { COLS_PER_THREAD = Traits::GMMA_N / THREADS_PER_ROW / ELTS_PER_THREAD }; + + // Use BMM1 softcapping scale or not. + enum { ENABLE_BMM1_SOFTCAPPING_SCALE = Kernel_traits::ENABLE_BMM1_SOFTCAPPING_SCALE }; + + // Ctor. + template + inline __device__ Softmax(Params const& params, void* smem, int bidb, int tidx) + : Base(params, smem, bidb, tidx) {} + + // Convert from FP16 fragments to floats. + inline __device__ void unpack(Accumulator const (&acc)[MMAS_M][MMAS_N]) { + float const& scale_f = reinterpret_cast(this->params_scale_bmm1_); +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { +#pragma unroll + for (int ni = 0; ni < MMAS_N; ++ni) { +#pragma unroll + for (int col_idx = 0; col_idx < COLS_PER_THREAD; ++col_idx) { +#pragma unroll + for (int row_idx = 0; row_idx < ROWS_PER_THREAD; ++row_idx) { + // the order of the acc rf is we traverse vertically first + // then we traverse horizontally. + int elt_row = ROWS_PER_THREAD * mi + row_idx; + int elt_col = COLS_PER_THREAD * ELTS_PER_THREAD * ni + col_idx * ELTS_PER_THREAD; + + float elt0 = acc[mi][ni].elt(col_idx * 2 * ROWS_PER_THREAD + 2 * row_idx + 0) * scale_f; + float elt1 = acc[mi][ni].elt(col_idx * 2 * ROWS_PER_THREAD + 2 * row_idx + 1) * scale_f; + + // 1.0f / softcapping_scale has been fused to scale_bmm1. 
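+            // Softcapping bounds the logits to (-softcapping_scale, softcapping_scale)
+            // via s * tanh(x / s); the division by s is already folded into scale_bmm1,
+            // so only the multiply by s remains here.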
+ if constexpr (ENABLE_BMM1_SOFTCAPPING_SCALE) { + elt0 = this->params_softcapping_scale_bmm1_ * __tanhf(elt0); + elt1 = this->params_softcapping_scale_bmm1_ * __tanhf(elt1); + } + + this->elt_[elt_row][elt_col + 0] = elt0; + this->elt_[elt_row][elt_col + 1] = elt1; + + } // row_idx + } // col_idx + } // ni + } // mi + } + + // Store the tile after softmax. + template + inline __device__ void store(Gmem_tile& gmem_tile) { + Accumulator acc[MMAS_M][MMAS_N]; + +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { +#pragma unroll + for (int ni = 0; ni < MMAS_N; ++ni) { +#pragma unroll + for (int col_idx = 0; col_idx < COLS_PER_THREAD; ++col_idx) { +#pragma unroll + for (int row_idx = 0; row_idx < ROWS_PER_THREAD; ++row_idx) { + // the order of the acc rf is we traverse vertically first + // then we traverse horizontally + int elt_row = ROWS_PER_THREAD * mi + row_idx; + int elt_col = COLS_PER_THREAD * ELTS_PER_THREAD * ni + col_idx * ELTS_PER_THREAD; + float elt0 = this->elt_[elt_row][elt_col + 0]; + float elt1 = this->elt_[elt_row][elt_col + 1]; + + acc[mi][ni].elt(col_idx * 2 * ROWS_PER_THREAD + 2 * row_idx + 0) = elt0; + acc[mi][ni].elt(col_idx * 2 * ROWS_PER_THREAD + 2 * row_idx + 1) = elt1; + } // row_idx + } // col_idx + } // ni + } // m + + // Delegate to the gmem tile to store. + gmem_tile.store(acc); + } + + // Pack the data to a fragment for the next GEMM. + template + inline __device__ void pack(Fragment_a (&dst)[K][M]) const { +// we know the instruction shape is 64xNx16 +// Thus for input A matrix, it is of size 64x16 per warpgroup. +// Thus, each threads access 2 rows and 4 columns. contiguous 2 columns are held by 1 RF. +#pragma unroll + for (int mi = 0; mi < M; ++mi) { +#pragma unroll + for (int ki = 0; ki < K; ++ki) { + // 1st row - 4 elements per row. + float tmp_00 = this->elt_[2 * mi + 0][4 * ki + 0]; + float tmp_01 = this->elt_[2 * mi + 0][4 * ki + 1]; + float tmp_02 = this->elt_[2 * mi + 0][4 * ki + 2]; + float tmp_03 = this->elt_[2 * mi + 0][4 * ki + 3]; + + // 2nd row - 4 elements per row. + float tmp_10 = this->elt_[2 * mi + 1][4 * ki + 0]; + float tmp_11 = this->elt_[2 * mi + 1][4 * ki + 1]; + float tmp_12 = this->elt_[2 * mi + 1][4 * ki + 2]; + float tmp_13 = this->elt_[2 * mi + 1][4 * ki + 3]; + + // Pack to 4 registers. + dst[ki][mi].reg(0) = fmha::float2_to_half2(tmp_00, tmp_01); + dst[ki][mi].reg(1) = fmha::float2_to_half2(tmp_10, tmp_11); + dst[ki][mi].reg(2) = fmha::float2_to_half2(tmp_02, tmp_03); + dst[ki][mi].reg(3) = fmha::float2_to_half2(tmp_12, tmp_13); + } + } + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Softmax, + Cta_tile_, Kernel_traits_> + : public Softmax_gmma_base< + fmha::Hopper_hgmma_bf16_traits, Cta_tile_, + Kernel_traits_, Cta_tile_::WARPS_N> { + // The traits. + using Traits = fmha::Hopper_hgmma_bf16_traits; + // Cta_tile. + using Cta_tile = Cta_tile_; + // Kernel_traits. + using Kernel_traits = Kernel_traits_; + // The Base class. + using Base = Softmax_gmma_base; + // The accumulators. + using Accumulator = typename Base::Accumulator; + // The Mma tile. + using Mma_tile = typename Base::Mma_tile; + + // The number of MMAs in M/N dimensions. + enum { MMAS_M = Mma_tile::MMAS_M }; + + enum { MMAS_N = Mma_tile::MMAS_N }; + + // for HGMMA_FP16, there are 2 elements per RF for ACC. + enum { ELTS_PER_THREAD = 2 }; + + // for Hopper HGMMA, each row is held within 4 threads. + enum { THREADS_PER_ROW = 4 }; + + // The number of rows accessed by each thread. 
+ enum { + ROWS_PER_THREAD = + Traits::GMMA_M / (Cta_tile::THREADS_PER_WARP / THREADS_PER_ROW) / Cta_tile::WARPS_M + }; + + // The number of columns access by each thread. + // Note there are 2 elements per reg. + enum { COLS_PER_THREAD = Traits::GMMA_N / THREADS_PER_ROW / ELTS_PER_THREAD }; + + // Use BMM1 softcapping scale or not. + enum { ENABLE_BMM1_SOFTCAPPING_SCALE = Kernel_traits::ENABLE_BMM1_SOFTCAPPING_SCALE }; + + // Ctor. + template + inline __device__ Softmax(Params const& params, void* smem, int bidb, int tidx) + : Base(params, smem, bidb, tidx) {} + + // Convert from FP16 fragments to floats. + inline __device__ void unpack(Accumulator const (&acc)[MMAS_M][MMAS_N]) { + float const& scale_f = reinterpret_cast(this->params_scale_bmm1_); +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { +#pragma unroll + for (int ni = 0; ni < MMAS_N; ++ni) { +#pragma unroll + for (int col_idx = 0; col_idx < COLS_PER_THREAD; ++col_idx) { +#pragma unroll + for (int row_idx = 0; row_idx < ROWS_PER_THREAD; ++row_idx) { + // the order of the acc rf is we traverse vertically first + // then we traverse horizontally. + int elt_row = ROWS_PER_THREAD * mi + row_idx; + int elt_col = COLS_PER_THREAD * ELTS_PER_THREAD * ni + col_idx * ELTS_PER_THREAD; + + float elt0 = acc[mi][ni].elt(col_idx * 2 * ROWS_PER_THREAD + 2 * row_idx + 0) * scale_f; + float elt1 = acc[mi][ni].elt(col_idx * 2 * ROWS_PER_THREAD + 2 * row_idx + 1) * scale_f; + + if constexpr (ENABLE_BMM1_SOFTCAPPING_SCALE) { + elt0 = this->params_softcapping_scale_bmm1_ * __tanhf(elt0); + elt1 = this->params_softcapping_scale_bmm1_ * __tanhf(elt1); + } + + this->elt_[elt_row][elt_col + 0] = elt0; + this->elt_[elt_row][elt_col + 1] = elt1; + + } // row_idx + } // col_idx + } // ni + } // mi + } + + // Store the tile after softmax. + template + inline __device__ void store(Gmem_tile& gmem_tile) { + Accumulator acc[MMAS_M][MMAS_N]; + +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { +#pragma unroll + for (int ni = 0; ni < MMAS_N; ++ni) { +#pragma unroll + for (int col_idx = 0; col_idx < COLS_PER_THREAD; ++col_idx) { +#pragma unroll + for (int row_idx = 0; row_idx < ROWS_PER_THREAD; ++row_idx) { + // the order of the acc rf is we traverse vertically first + // then we traverse horizontally. + int elt_row = ROWS_PER_THREAD * mi + row_idx; + int elt_col = COLS_PER_THREAD * ELTS_PER_THREAD * ni + col_idx * ELTS_PER_THREAD; + float elt0 = this->elt_[elt_row][elt_col + 0]; + float elt1 = this->elt_[elt_row][elt_col + 1]; + + acc[mi][ni].elt(col_idx * 2 * ROWS_PER_THREAD + 2 * row_idx + 0) = elt0; + acc[mi][ni].elt(col_idx * 2 * ROWS_PER_THREAD + 2 * row_idx + 1) = elt1; + } // row_idx + } // col_idx + } // ni + } // m + + // Delegate to the gmem tile to store. + gmem_tile.store(acc); + } + + // Pack the data to a fragment for the next GEMM. + template + inline __device__ void pack(Fragment_a (&dst)[K][M]) const { +// we know the instruction shape is 64xNx16 +// Thus for input A matrix, it is of size 64x16 per warpgroup. +// Thus, each threads access 2 rows and 4 columns. contiguous 2 columns are held by 1 RF. +#pragma unroll + for (int mi = 0; mi < M; ++mi) { +#pragma unroll + for (int ki = 0; ki < K; ++ki) { + // 1st row - 4 elements per row. + float tmp_00 = this->elt_[2 * mi + 0][4 * ki + 0]; + float tmp_01 = this->elt_[2 * mi + 0][4 * ki + 1]; + float tmp_02 = this->elt_[2 * mi + 0][4 * ki + 2]; + float tmp_03 = this->elt_[2 * mi + 0][4 * ki + 3]; + + // 2nd row - 4 elements per row. 
+ float tmp_10 = this->elt_[2 * mi + 1][4 * ki + 0]; + float tmp_11 = this->elt_[2 * mi + 1][4 * ki + 1]; + float tmp_12 = this->elt_[2 * mi + 1][4 * ki + 2]; + float tmp_13 = this->elt_[2 * mi + 1][4 * ki + 3]; + + // Pack to 4 registers. + dst[ki][mi].reg(0) = fmha::float2_to_bf16_x2(tmp_00, tmp_01); + dst[ki][mi].reg(1) = fmha::float2_to_bf16_x2(tmp_10, tmp_11); + dst[ki][mi].reg(2) = fmha::float2_to_bf16_x2(tmp_02, tmp_03); + dst[ki][mi].reg(3) = fmha::float2_to_bf16_x2(tmp_12, tmp_13); + } + } + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Softmax_gmma_32bit_8bit_base + : public Softmax_gmma_base { + // The Base class. + using Base = Softmax_gmma_base; + // The accumulators. + using Accumulator = typename Base::Accumulator; + // The Mma tile. + using Mma_tile = typename Base::Mma_tile; + + // The number of MMAs in M/N dimensions. + enum { MMAS_M = Mma_tile::MMAS_M }; + + enum { MMAS_N = Mma_tile::MMAS_N }; + + // TODO these should be general. + // Two elts per thread per acc core matrix. + enum { ELTS_PER_THREAD = 2 }; + + // Number of threads per row of the acc core matrix. + enum { THREADS_PER_ROW = 4 }; + + // The number of rows accessed by each thread per GMMA. + enum { + ROWS_PER_THREAD = + Traits::GMMA_M / (Cta_tile::THREADS_PER_WARP / THREADS_PER_ROW) / Cta_tile::WARPS_M + }; + + // The number of columns access by each thread. + enum { COLS_PER_THREAD = Traits::GMMA_N / THREADS_PER_ROW / ELTS_PER_THREAD }; + + // Check the expected number of accumulator elements. + static_assert(Accumulator::NUM_ELTS == COLS_PER_THREAD * ROWS_PER_THREAD * ELTS_PER_THREAD); + + // Ctor. + template + inline __device__ Softmax_gmma_32bit_8bit_base(Params const& params, void* smem, int bidb, + int tidx) + : Base(params, smem, bidb, tidx) {} + + inline __device__ void unpack(Accumulator const (&acc)[MMAS_M][MMAS_N]) { + float const scalef = reinterpret_cast(this->params_scale_bmm1_); +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { +#pragma unroll + for (int ni = 0; ni < MMAS_N; ++ni) { +#pragma unroll + for (int ii = 0; ii < COLS_PER_THREAD; ++ii) { + float tmp_00 = + acc[mi][ni].elt(ii * ROWS_PER_THREAD * ELTS_PER_THREAD + 0 * ELTS_PER_THREAD + 0) * + scalef; + float tmp_01 = + acc[mi][ni].elt(ii * ROWS_PER_THREAD * ELTS_PER_THREAD + 0 * ELTS_PER_THREAD + 1) * + scalef; + float tmp_10 = + acc[mi][ni].elt(ii * ROWS_PER_THREAD * ELTS_PER_THREAD + 1 * ELTS_PER_THREAD + 0) * + scalef; + float tmp_11 = + acc[mi][ni].elt(ii * ROWS_PER_THREAD * ELTS_PER_THREAD + 1 * ELTS_PER_THREAD + 1) * + scalef; + int n_offset = ni * COLS_PER_THREAD * ELTS_PER_THREAD + ii * ELTS_PER_THREAD; + this->elt_[mi * ROWS_PER_THREAD + 0][n_offset + 0] = tmp_00; + this->elt_[mi * ROWS_PER_THREAD + 0][n_offset + 1] = tmp_01; + this->elt_[mi * ROWS_PER_THREAD + 1][n_offset + 0] = tmp_10; + this->elt_[mi * ROWS_PER_THREAD + 1][n_offset + 1] = tmp_11; + } // ii + } // ni + } // mi + } + + inline __device__ void unpack_noscale(Accumulator const (&acc)[MMAS_M][MMAS_N]) { +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { +#pragma unroll + for (int ni = 0; ni < MMAS_N; ++ni) { +#pragma unroll + for (int ii = 0; ii < COLS_PER_THREAD; ++ii) { + float tmp_00 = + acc[mi][ni].elt(ii * ROWS_PER_THREAD * ELTS_PER_THREAD + 0 * ELTS_PER_THREAD + 0); + float tmp_01 = + acc[mi][ni].elt(ii * ROWS_PER_THREAD * ELTS_PER_THREAD + 0 * ELTS_PER_THREAD + 1); + float tmp_10 = + acc[mi][ni].elt(ii * ROWS_PER_THREAD * ELTS_PER_THREAD + 1 * ELTS_PER_THREAD + 
0); + float tmp_11 = + acc[mi][ni].elt(ii * ROWS_PER_THREAD * ELTS_PER_THREAD + 1 * ELTS_PER_THREAD + 1); + int n_offset = ni * COLS_PER_THREAD * ELTS_PER_THREAD + ii * ELTS_PER_THREAD; + this->elt_[mi * ROWS_PER_THREAD + 0][n_offset + 0] = tmp_00; + this->elt_[mi * ROWS_PER_THREAD + 0][n_offset + 1] = tmp_01; + this->elt_[mi * ROWS_PER_THREAD + 1][n_offset + 0] = tmp_10; + this->elt_[mi * ROWS_PER_THREAD + 1][n_offset + 1] = tmp_11; + } // ii + } // ni + } // mi + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Softmax, + Cta_tile, Kernel_traits> + : public Softmax_gmma_32bit_8bit_base< + fmha::Hopper_qgmma_e4m3_fp32_traits, + Cta_tile, Kernel_traits> { + // The traits. + using Traits = fmha::Hopper_qgmma_e4m3_fp32_traits; + // The Base class. + using Base = Softmax_gmma_32bit_8bit_base; + + using Accumulator = typename Base::Accumulator; + + enum { + MMAS_M = Base::MMAS_M, + MMAS_N = Base::MMAS_N, + ROWS_PER_THREAD = Base::ROWS_PER_THREAD, + COLS_PER_THREAD = Base::COLS_PER_THREAD, + ELTS_PER_THREAD = Base::ELTS_PER_THREAD, + }; + + // Ctor. + template + inline __device__ Softmax(Params const& params, void* smem, int bidb, int tidx) + : Base(params, smem, bidb, tidx), params_scale_softmax_(params.scale_softmax) {} + + // Store the tile after softmax. + template + inline __device__ void store(Gmem_tile& gmem_tile) { + float const scale = reinterpret_cast(this->params_scale_softmax_); + + Accumulator acc[MMAS_M][MMAS_N]; + +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { +#pragma unroll + for (int ni = 0; ni < MMAS_N; ++ni) { +#pragma unroll + for (int ii = 0; ii < COLS_PER_THREAD; ++ii) { + int row = mi * ROWS_PER_THREAD; + int col = ni * COLS_PER_THREAD * ELTS_PER_THREAD + ii * ELTS_PER_THREAD; + float tmp_00 = this->elt_[row + 0][col + 0] * scale; + float tmp_01 = this->elt_[row + 0][col + 1] * scale; + float tmp_10 = this->elt_[row + 1][col + 0] * scale; + float tmp_11 = this->elt_[row + 1][col + 1] * scale; + + int elt_idx = ii * ROWS_PER_THREAD * ELTS_PER_THREAD; + acc[mi][ni].elt(elt_idx + 0 * ELTS_PER_THREAD + 0) = tmp_00; + acc[mi][ni].elt(elt_idx + 0 * ELTS_PER_THREAD + 1) = tmp_01; + acc[mi][ni].elt(elt_idx + 1 * ELTS_PER_THREAD + 0) = tmp_10; + acc[mi][ni].elt(elt_idx + 1 * ELTS_PER_THREAD + 1) = tmp_11; + } // ii + } // ni + } // mi + + // Delegate to the gmem tile to store. + gmem_tile.store(acc); + } + + // Pack the data to a fragment for the next GEMM. + template + inline __device__ void pack(Fragment_a (&dst)[K][M]) const { + static_assert(M == 1); + static_assert(Fragment_a::NUM_REGS == 4); + static_assert(Fragment_a::NUM_ELTS == 16); + // Acc per warp: 16 x 256 FP32 + // A is 8 times(in K) 16 x 32 FP8, i.e. 4 registers per thread. + + static_assert(MMAS_N * COLS_PER_THREAD * ELTS_PER_THREAD % 8 == 0); + static_assert(MMAS_N * COLS_PER_THREAD * ELTS_PER_THREAD == K * Fragment_a::NUM_ELTS / 2); + + float const scale = reinterpret_cast(this->params_scale_softmax_); + +// The canonical layout in K should be R0: [0,1,2,3] R2: [16,17,18,19] +// Note below that this is not possible with the register layout of the accumulator. +#pragma unroll + for (int mi = 0; mi < M; ++mi) { +#pragma unroll + for (int ki = 0; ki < K; ++ki) { + // 1st row - 8 elements per row. 
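+        // The "// + n" annotations below record the K offset each value ends up at after
+        // packing: adjacent elt_ pairs come from different 8-column core matrices, so
+        // register 0 holds offsets {0, 1, 8, 9} rather than the canonical {0, 1, 2, 3}
+        // mentioned above.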
+ float tmp_00 = this->elt_[2 * mi + 0][8 * ki + 0] * scale; // + 0 + float tmp_01 = this->elt_[2 * mi + 0][8 * ki + 1] * scale; // + 1 + float tmp_02 = this->elt_[2 * mi + 0][8 * ki + 2] * scale; // + 8 + float tmp_03 = this->elt_[2 * mi + 0][8 * ki + 3] * scale; // + 9 + float tmp_04 = this->elt_[2 * mi + 0][8 * ki + 4] * scale; // +16 + float tmp_05 = this->elt_[2 * mi + 0][8 * ki + 5] * scale; // +17 + float tmp_06 = this->elt_[2 * mi + 0][8 * ki + 6] * scale; // +24 + float tmp_07 = this->elt_[2 * mi + 0][8 * ki + 7] * scale; // +25 + + // 2nd row - 4 elements per row. + float tmp_10 = this->elt_[2 * mi + 1][8 * ki + 0] * scale; // + 0 + float tmp_11 = this->elt_[2 * mi + 1][8 * ki + 1] * scale; // + 1 + float tmp_12 = this->elt_[2 * mi + 1][8 * ki + 2] * scale; // + 8 + float tmp_13 = this->elt_[2 * mi + 1][8 * ki + 3] * scale; // + 9 + float tmp_14 = this->elt_[2 * mi + 1][8 * ki + 4] * scale; // +16 + float tmp_15 = this->elt_[2 * mi + 1][8 * ki + 5] * scale; // +17 + float tmp_16 = this->elt_[2 * mi + 1][8 * ki + 6] * scale; // +24 + float tmp_17 = this->elt_[2 * mi + 1][8 * ki + 7] * scale; // +25 + + // Pack to 4 registers. + dst[ki][mi].reg(0) = fmha::float4_to_fp8x4(tmp_00, tmp_01, tmp_02, tmp_03); + dst[ki][mi].reg(1) = fmha::float4_to_fp8x4(tmp_10, tmp_11, tmp_12, tmp_13); + dst[ki][mi].reg(2) = fmha::float4_to_fp8x4(tmp_04, tmp_05, tmp_06, tmp_07); + dst[ki][mi].reg(3) = fmha::float4_to_fp8x4(tmp_14, tmp_15, tmp_16, tmp_17); + } + } + } + + uint32_t const params_scale_softmax_; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Softmax, + Cta_tile, Kernel_traits> + : public Softmax_gmma_32bit_8bit_base< + fmha::Hopper_igmma_int8_int32_traits, + Cta_tile, Kernel_traits> { + // The traits. + using Traits = fmha::Hopper_igmma_int8_int32_traits; + + // The Base class. + using Base = Softmax_gmma_32bit_8bit_base; + + using Accumulator = typename Base::Accumulator; + + enum { + MMAS_M = Base::MMAS_M, + MMAS_N = Base::MMAS_N, + ROWS_PER_THREAD = Base::ROWS_PER_THREAD, + COLS_PER_THREAD = Base::COLS_PER_THREAD, + ELTS_PER_THREAD = Base::ELTS_PER_THREAD, + }; + + // Ctor. + template + inline __device__ Softmax(Params const& params, void* smem, int bidb, int tidx) + : Base(params, smem, bidb, tidx), params_scale_softmax_(params.scale_softmax) {} + + // Store the tile after softmax. + template + inline __device__ void store(Gmem_tile& gmem_tile) { + float const scale = reinterpret_cast(this->params_scale_softmax_); + Accumulator acc[MMAS_M][MMAS_N]; + +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { +#pragma unroll + for (int ni = 0; ni < MMAS_N; ++ni) { +#pragma unroll + for (int ii = 0; ii < COLS_PER_THREAD; ++ii) { + int n_offset = ni * COLS_PER_THREAD * ELTS_PER_THREAD + ii * ELTS_PER_THREAD; + float tmp_00 = this->elt_[mi * ROWS_PER_THREAD + 0][n_offset + 0]; + float tmp_01 = this->elt_[mi * ROWS_PER_THREAD + 0][n_offset + 1]; + float tmp_10 = this->elt_[mi * ROWS_PER_THREAD + 1][n_offset + 0]; + float tmp_11 = this->elt_[mi * ROWS_PER_THREAD + 1][n_offset + 1]; + + int elt_offset = ii * ROWS_PER_THREAD * ELTS_PER_THREAD; + acc[mi][ni].elt(elt_offset + 0 * ELTS_PER_THREAD + 0) = tmp_00 * scale; + acc[mi][ni].elt(elt_offset + 0 * ELTS_PER_THREAD + 1) = tmp_01 * scale; + acc[mi][ni].elt(elt_offset + 1 * ELTS_PER_THREAD + 0) = tmp_10 * scale; + acc[mi][ni].elt(elt_offset + 1 * ELTS_PER_THREAD + 1) = tmp_11 * scale; + } // ii + } // ni + } // mi + + // Delegate to the gmem tile to store. 
+ gmem_tile.store(acc); + } + + // Pack the data to a fragment for the next GEMM. + template + inline __device__ void pack(Fragment_a (&dst)[K][M]) const { + static_assert(M == 1); + static_assert(Fragment_a::NUM_REGS == 4); + static_assert(Fragment_a::NUM_ELTS == 16); + // Acc per warp: 16 x 256 FP32 + // A is 8 times(in K) 16 x 32 FP8, i.e. 4 registers per thread. + + static_assert(MMAS_N * COLS_PER_THREAD * ELTS_PER_THREAD % 8 == 0); + static_assert(MMAS_N * COLS_PER_THREAD * ELTS_PER_THREAD == K * Fragment_a::NUM_ELTS / 2); + + float const scale = reinterpret_cast(this->params_scale_softmax_); +// The canonical layout in K should be R0: [0,1,2,3] R2: [16,17,18,19] +// Note below that this is not possible with the register layout of the accumulator. +#pragma unroll + for (int mi = 0; mi < M; ++mi) { +#pragma unroll + for (int ki = 0; ki < K; ++ki) { + // 1st row - 8 elements per row. + float tmp_00 = this->elt_[2 * mi + 0][8 * ki + 0] * scale; // + 0 + float tmp_01 = this->elt_[2 * mi + 0][8 * ki + 1] * scale; // + 1 + float tmp_02 = this->elt_[2 * mi + 0][8 * ki + 2] * scale; // + 8 + float tmp_03 = this->elt_[2 * mi + 0][8 * ki + 3] * scale; // + 9 + float tmp_04 = this->elt_[2 * mi + 0][8 * ki + 4] * scale; // +16 + float tmp_05 = this->elt_[2 * mi + 0][8 * ki + 5] * scale; // +17 + float tmp_06 = this->elt_[2 * mi + 0][8 * ki + 6] * scale; // +24 + float tmp_07 = this->elt_[2 * mi + 0][8 * ki + 7] * scale; // +25 + + // 2nd row - 4 elements per row. + float tmp_10 = this->elt_[2 * mi + 1][8 * ki + 0] * scale; // + 0 + float tmp_11 = this->elt_[2 * mi + 1][8 * ki + 1] * scale; // + 1 + float tmp_12 = this->elt_[2 * mi + 1][8 * ki + 2] * scale; // + 8 + float tmp_13 = this->elt_[2 * mi + 1][8 * ki + 3] * scale; // + 9 + float tmp_14 = this->elt_[2 * mi + 1][8 * ki + 4] * scale; // +16 + float tmp_15 = this->elt_[2 * mi + 1][8 * ki + 5] * scale; // +17 + float tmp_16 = this->elt_[2 * mi + 1][8 * ki + 6] * scale; // +24 + float tmp_17 = this->elt_[2 * mi + 1][8 * ki + 7] * scale; // +25 + + // Pack to 4 registers. + dst[ki][mi].reg(0) = fmha::float4_to_char4(tmp_00, tmp_01, tmp_02, tmp_03); + dst[ki][mi].reg(1) = fmha::float4_to_char4(tmp_10, tmp_11, tmp_12, tmp_13); + dst[ki][mi].reg(2) = fmha::float4_to_char4(tmp_04, tmp_05, tmp_06, tmp_07); + dst[ki][mi].reg(3) = fmha::float4_to_char4(tmp_14, tmp_15, tmp_16, tmp_17); + } + } + } + + uint32_t const params_scale_softmax_; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// The softmax normalization statistics used by flash attention (l, m) +template +struct Softmax_statistics { + // The shape of the MMA tile. + using Mma_tile = typename Traits::template Mma_tile; + + // The number of MMAs in the M dimension. + enum { MMAS_M = Mma_tile::MMAS_M }; + + // Ctor. + template + inline __device__ Softmax_statistics(Params const& params, void const* ptr, Binfo const& binfo, + int tidx) + : ptr_(reinterpret_cast(ptr)), seqlen_(binfo.actual_seqlen) { + // The decomposition of the thread index into warp/lane. + int warp = tidx / Cta_tile::THREADS_PER_WARP; + int lane = tidx % Cta_tile::THREADS_PER_WARP; + + // The position of the the warp in the CTA. + int warp_m = warp % Cta_tile::WARPS_M; + + // The position of the thread + token_ = warp_m * Mma_tile::M_PER_MMA + lane / 4; + + // Compute the offset to the first token of the sequence. + int64_t offset = binfo.bidb * params.h + binfo.bidh; + // Move the pointer to the correct position. 
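+    // offset = bidb * h + bidh selects the (batch, head) slice and lse_stride_in_bytes is
+    // the distance between two such slices, so ptr_ ends up at this sequence's statistics.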
+ ptr_ += offset * params.lse_stride_in_bytes; + } + + // Load the bias into registers (and expand). + inline __device__ void load(int step) { +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { +#pragma unroll + for (int ii = 0; ii < 2; ++ii) { + // The index of the token. + int token = token_; + // At each iteration we jump over STEPQ elements. + token += step * Cta_tile::M; + // The extra offset inside the CTA. + token += mi * Mma_tile::M_PER_MMA_PER_CTA + (ii & 0x1) * 8; + + // Fetch the value if the token is valid. + float val = 0.0f; + if (token < seqlen_) { + val = reinterpret_cast(ptr_)[token]; + } + lm_[2 * mi + ii] = val; + } + } + } + + // The pointer to the bias. + int8_t const* ptr_; + // The length of the sequence. + int const seqlen_; + // The token that this thread is loading. + int token_; + // The bias after expansion. + float lm_[MMAS_M * 2]; +}; + +} // namespace fmha diff --git a/csrc/fmha_v2/fmha/traits.h b/csrc/fmha_v2/fmha/traits.h new file mode 100644 index 0000000000..bb6f4b700d --- /dev/null +++ b/csrc/fmha_v2/fmha/traits.h @@ -0,0 +1,942 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2011-2024 NVIDIA CORPORATION & AFFILIATES. All rights + * reserved. SPDX-License-Identifier: NVIDIA TensorRT Source Code License Agreement + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. + */ + +#pragma once + +#include + +#include "fmha/numeric_types.h" + +#define FMHA_DIV_UP(m, n) (((m) + (n) - 1) / (n)) + +namespace fmha { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// Trait class for heuristically determining the tile sizes +template +struct Traits_tile_size; + +template +struct Traits_tile_size { + enum { + CTA_P_TILE_M = STEP, + CTA_P_TILE_N = S, + CTA_P_TILE_K = D, + CTA_O_TILE_M = CTA_P_TILE_M, + CTA_O_TILE_N = DV, + CTA_O_TILE_K = S + }; +}; + +template +struct Traits_tile_size { + enum { + CTA_P_TILE_M = STEP, + CTA_P_TILE_N = S, + // D =16: CTA_P_TILE_K=16 + // D =32: CTA_P_TILE_K=32 + // D>=64: CTA_P_TILE_K=64 + CTA_P_TILE_K = D < 32 ? 16 : (D < 64 ? 32 : 64), + CTA_O_TILE_M = CTA_P_TILE_M, + // D =512: CTA_TILE_N=256 + // D<=256: CTA_TILE_N=D + CTA_O_TILE_N = DV > 256 ? 256 : DV, + // D =512: CTA_O_TILE_K=16 + // D =256: CTA_O_TILE_K=32 + // D<=128: CTA_O_TILE_K=64 + CTA_O_TILE_K = std::max(K_PER_MMA, DV > 256 ? 16 : (DV > 128 ? 32 : 64)) + }; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + // The GPU architecture. + typename Gpu_arch, + // The number of rows in the CTA tile. + int M_, + // The number of cols in the CTA tile. + int N_, + // The number of elements in the the K dimension of the GEMM loop. + int K_, + // The number of valid cols in the CTA tile. + int VALID_N_, + // The number of valid elements in the the K dimension of the GEMM loop. + int VALID_K_, + // The number of rows of warps. + int WARPS_M_, + // The number of cols of warps. + int WARPS_N_, + // The number of warps in the K dimension of the GEMM loop. + int WARPS_K_> +struct Cta_tile_ { + enum { M = M_, N = N_, K = K_, VALID_N = VALID_N_, VALID_K = VALID_K_ }; + + // The number of warps. 
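+  // The warp grid is WARPS_M x WARPS_N x WARPS_K; e.g. a 4x1x1 grid yields
+  // WARPS_PER_CTA = 4 and, with 32 threads per warp, THREADS_PER_CTA = 128.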
+ enum { WARPS_M = WARPS_M_, WARPS_N = WARPS_N_, WARPS_K = WARPS_K_ }; + + // The number of warps per CTA. + enum { WARPS_PER_CTA = WARPS_M * WARPS_N * WARPS_K }; + + // The number of threads per warp. + enum { THREADS_PER_WARP = Gpu_arch::THREADS_PER_WARP }; + + // The number of threads per CTA. + enum { THREADS_PER_CTA = WARPS_PER_CTA * THREADS_PER_WARP }; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + // The GPU architecture. + typename Gpu_arch_, + // The type of the elements of A. + typename A_type_, + // The type of the elements of B. + typename B_type_, + // The type of the elements of C. + typename C_type_, + // The type of the elements of the accumulators. + typename Accumulator_type_, + // The type of the elements of the epilogue. + typename Epilogue_type_> +struct Traits { + // The architecture. + using Gpu_arch = Gpu_arch_; + // The data type for A elements. + using A_type = A_type_; + // The data type for B elements. + using B_type = B_type_; + // The data type for C elements. + using C_type = C_type_; + // The data type for accumulators. + using Accumulator_type = Accumulator_type_; + // The data type of the math in the epilogue. + using Epilogue_type = Epilogue_type_; + + // Create the description of the CTA tile from a configuration. + template + using Cta_tile_extd = Cta_tile_; + + // The number of bits per element of A. + enum { BITS_PER_ELEMENT_A = sizeof(A_type) * 8 }; + + // An offset in bytes for A. + static inline __host__ __device__ int64_t offset_in_bytes_a(int64_t offset) { + return offset * static_cast(sizeof(A_type)); + } + + // The number of bits per element of B. + enum { BITS_PER_ELEMENT_B = sizeof(B_type) * 8 }; + + // An offset in bytes for B. + static inline __host__ __device__ int64_t offset_in_bytes_b(int64_t offset) { + return offset * static_cast(sizeof(B_type)); + } + + // The number of bits per element of C. + enum { BITS_PER_ELEMENT_C = sizeof(C_type) * 8 }; + + // An offset in bytes for C. + static inline __host__ __device__ int64_t offset_in_bytes_c(int64_t offset) { + return offset * static_cast(sizeof(C_type)); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct Gpu_arch_base { + // By default, architectures have 32 threads per warp. + enum { THREADS_PER_WARP = 32 }; + + // By default, architectures do not support LDGSTS. + enum { HAS_LDGSTS = 0 }; + + // By default, architecture do not support super HMMA + enum { HAS_SUPER_HMMA = 0 }; + + // By default, architecture do not support TMA + enum { HAS_TMA = 0 }; + + // By default, architecture do not support GMMA + enum { HAS_GMMA = 0 }; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +using Cta_tile_with_k_with_padding = typename Traits_::template Cta_tile_extd< + Cta_tile_::M, Cta_tile_::N, Next_power_of_two::VALUE, Cta_tile_::N, + Next_power_of_two::VALUE, Cta_tile_::WARPS_M, Cta_tile_::WARPS_N, + Cta_tile_::WARPS_K>; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct Volta : public Gpu_arch_base {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Volta_mma_tile { + // The number of elements computed with a single warp-MMA. 
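+  // (M_PER_MMA is fixed at 16 rows per warp; the N and K extents come from the
+  // instruction shape selected by the concrete Volta traits below.)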
+ enum { M_PER_MMA = 16, N_PER_MMA = N_PER_MMA_, K_PER_MMA = K_PER_MMA_ }; + + // The number of elements computed with a single CTA-MMA. + enum { + M_PER_MMA_PER_CTA = M_PER_MMA * Cta_tile::WARPS_M, + N_PER_MMA_PER_CTA = N_PER_MMA * Cta_tile::WARPS_N, + K_PER_MMA_PER_CTA = K_PER_MMA * Cta_tile::WARPS_K + }; + + // The number of MMAs needed to compute the GEMM. + enum { + MMAS_M = (Cta_tile::M + M_PER_MMA_PER_CTA - 1) / M_PER_MMA_PER_CTA, + MMAS_N = (Cta_tile::N + N_PER_MMA_PER_CTA - 1) / N_PER_MMA_PER_CTA, + MMAS_K = (Cta_tile::K + K_PER_MMA_PER_CTA - 1) / K_PER_MMA_PER_CTA + }; + + // The number of valid MMAs (for Head Size) + enum { + // tile o + VALID_MMAS_N = Div_up::VALUE, + // tile p + VALID_MMAS_K = Div_up::VALUE, + }; + + // The number of elements computed per warp. + enum { + M_PER_WARP = MMAS_M * M_PER_MMA, + N_PER_WARP = MMAS_N * N_PER_MMA, + K_PER_WARP = MMAS_K * K_PER_MMA, + }; + + // Do we enable the fast path for LDS. + enum { ENABLE_LDS_FAST_PATH = 0 }; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct Volta_hmma_fp16_traits + : public Traits { + // The K_PER_MMA for Volta_hmma_fp16_traits is 8. + enum { K_PER_MMA = 8 }; + + // The MMA tile. + template + using Mma_tile = Volta_mma_tile; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct Volta_hmma_fp16_16x16x16_traits + : public Traits { + // The K_PER_MMA for Volta_hmma_fp16_16x16x16_traits is 16. + enum { K_PER_MMA = 16 }; + + // The MMA tile. + template + using Mma_tile = Volta_mma_tile; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct Volta_imma_int8_int32_traits : public Traits { + // The K_PER_MMA for Volta_imma_int8_int32_traits is 16. + enum { K_PER_MMA = 16 }; + + // The MMA tile. + template + using Mma_tile = Volta_mma_tile; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct Turing : public Gpu_arch_base {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Turing_mma_tile { + // The number of elements computed with a single warp-MMA. + enum { M_PER_MMA = 16, N_PER_MMA = 16, K_PER_MMA = K_PER_MMA_ }; + + // The number of elements computed with a single CTA-MMA. + enum { + M_PER_MMA_PER_CTA = M_PER_MMA * Cta_tile::WARPS_M, + N_PER_MMA_PER_CTA = N_PER_MMA * Cta_tile::WARPS_N, + K_PER_MMA_PER_CTA = K_PER_MMA * Cta_tile::WARPS_K + }; + + // The number of MMAs needed to compute the GEMM. + enum { + MMAS_M = Div_up::VALUE, + MMAS_N = Div_up::VALUE, + MMAS_K = Div_up::VALUE, + }; + + // The number of valid MMAs (for Head Size) + enum { + // tile o + VALID_MMAS_N = Div_up::VALUE, + // tile p + VALID_MMAS_K = Div_up::VALUE, + }; + + // The number of elements computed per warp. + enum { + M_PER_WARP = MMAS_M * M_PER_MMA, + N_PER_WARP = MMAS_N * N_PER_MMA, + K_PER_WARP = MMAS_K * K_PER_MMA, + }; + + // The distribution of threads in the output tile. 
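+  // Threads cover the accumulator tile as an 8 x 4 grid: lane / 4 selects the row (and the
+  // row 8 below it) while lane % 4 selects a pair of adjacent columns.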
+ enum { + THREADS_PER_MMA_M = 8, + THREADS_PER_MMA_N = 4, + }; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Turing_hmma_tile : public Turing_mma_tile {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct Turing_hmma_fp16_traits + : public Traits { + // The K_PER_MMA for Turing_hmma_fp16_traits is 8. + enum { K_PER_MMA = 8 }; + + // The MMA tile. + template + using Mma_tile = Turing_hmma_tile; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct Turing_hmma_fp32_traits : public Traits { + // The K_PER_MMA for Turing_hmma_fp32_traits is 8. + enum { K_PER_MMA = 8 }; + + // The MMA tile. + template + using Mma_tile = Turing_hmma_tile; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Turing_imma_int8_tile : public Turing_mma_tile {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct Turing_imma_int8_int32_traits + : public Traits { + // The K_PER_MMA for Turing_imma_int8_int32_traits is 16. + enum { K_PER_MMA = 16 }; + + // The MMA tile. + template + using Mma_tile = Turing_imma_int8_tile; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct Ampere : public Gpu_arch_base { + // It has LDGSTS. + enum { HAS_LDGSTS = 1 }; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Ampere_hmma_tile : public Turing_mma_tile {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct Ampere_hmma_fp16_traits + : public Traits { + // The K_PER_MMA for Ampere_hmma_fp16_traits is 16. + enum { K_PER_MMA = 16 }; + + // The MMA tile. + template + using Mma_tile = Ampere_hmma_tile; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct Ampere_hmma_fp32_traits + : public Traits { + // The K_PER_MMA for Ampere_hmma_fp32_traits is 16. + enum { K_PER_MMA = 16 }; + + // The MMA tile. + template + using Mma_tile = Ampere_hmma_tile; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// used for Epilogue_type = bf16_t (similar to Ampere_hmma_fp16_traits). +struct Ampere_hmma_bf16_bf16_traits + : public Traits { + // The K_PER_MMA for Ampere_hmma_bf16_bf16_traits is 16. + enum { K_PER_MMA = 16 }; + + // The MMA tile. + template + using Mma_tile = Ampere_hmma_tile; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct Ampere_hmma_bf16_traits : public Traits { + // The K_PER_MMA for Ampere_hmma_bf16_traits is 16. + enum { K_PER_MMA = 16 }; + + // The MMA tile. + template + using Mma_tile = Ampere_hmma_tile; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Ampere_imma_int8_tile : public Turing_mma_tile {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct Ampere_imma_int8_int32_traits + : public Traits { + // The K_PER_MMA for Ampere_imma_int8_int32_traits is 32. + enum { K_PER_MMA = 32 }; + + // The MMA tile. 
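+  // (Each warp-level IMMA step now consumes 32 int8 values along K, twice the 16 used by
+  // the Turing IMMA traits above.)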
+ template + using Mma_tile = Ampere_imma_int8_tile; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct Ada : public Gpu_arch_base { + // It has LDGSTS. + enum { HAS_LDGSTS = 1 }; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// The following partial traits are mapped to Ampere_hmma_fp16_traits in fmha/kernel_traits.h. +// +// It is easier to implement setup.py this way. +struct Ada_hmma_fp16_traits {}; + +struct Ada_hmma_fp32_traits {}; + +struct Ada_imma_int8_int32_traits {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Ada_qmma_fp8_tile : public Turing_mma_tile {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct Ada_qmma_e4m3_fp16_traits : public Traits { + // The K_PER_MMA for Ada_qmma_e4m3_fp16_traits is 32. + enum { K_PER_MMA = 32 }; + + // The MMA tile. + template + using Mma_tile = Ada_qmma_fp8_tile; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct Ada_qmma_e4m3_fp32_traits : public Traits { + // The K_PER_MMA for Ada_qmma_e4m3_fp32_traits is 32. + enum { K_PER_MMA = 32 }; + + // The MMA tile. + template + using Mma_tile = Ada_qmma_fp8_tile; + + static constexpr float SOFTMAX_FP_QUANT_SCALE = Softmax_fp_quant_scale(); + static constexpr float SOFTMAX_FP_DEQUANT_SCALE = 1.f / SOFTMAX_FP_QUANT_SCALE; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct Hopper : public Gpu_arch_base { + // It has LDGSTS. + enum { HAS_LDGSTS = 1 }; + + // It has TMA. + enum { HAS_TMA = 1 }; + + // It has GMMA + enum { HAS_GMMA = 1 }; + + // for Hopper there are 4 warps per warpgroup. + enum { WARPS_PER_WARP_GROUP = 4 }; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// Hopper related code. +// SHOULD we move this to a different file?? +//////////////////////////////////////////////////////////////////////////////////////////////////// +template +struct Hopper_cga_tile { + // The size of the CGA in terms of CTA + enum { CLUSTER_HEIGHT = HEIGHT_ }; + + enum { CLUSTER_WIDTH = WIDTH_ }; + + enum { CLUSTER_DEPTH = DEPTH_ }; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +template // Number of warp group along K dim +struct Hopper_cta_tile { + // GPU arch. + using Gpu_arch = Gpu_arch_; + + // The size of the CTA tile. + // TODO: support D (not power of 2) + enum { M = M_, N = N_, K = K_, VALID_N = VALID_N_, VALID_K = VALID_K_ }; + + // The number of warp groups. + enum { WARP_GROUP_M = WARP_GROUP_M_, WARP_GROUP_N = WARP_GROUP_N_, WARP_GROUP_K = WARP_GROUP_K_ }; + + // The number of warps in a warp group. + enum { + WARPS_M_PER_GROUP = 4, + WARPS_N_PER_GROUP = 1, + WARPS_K_PER_GROUP = 1, + }; + + // The number of warps in a cta. + enum { + WARPS_M = WARPS_M_PER_GROUP * WARP_GROUP_M_, + WARPS_N = WARPS_N_PER_GROUP * WARP_GROUP_N_, + WARPS_K = WARPS_K_PER_GROUP * WARP_GROUP_K_ + }; + + // The number of warps per CTA. + enum { + WARPS_PER_CTA = WARP_GROUP_M * WARP_GROUP_N * WARP_GROUP_K * Gpu_arch::WARPS_PER_WARP_GROUP + }; + + // The number of warps per warpgroup. + enum { WARPS_PER_WARP_GROUP = Gpu_arch::WARPS_PER_WARP_GROUP }; + + // The number of threads per warp. 
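+  // (32 on all supported architectures, so a warp group is 4 x 32 = 128 threads; see
+  // THREADS_PER_WARP_GROUP below.)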
+ enum { THREADS_PER_WARP = Gpu_arch::THREADS_PER_WARP }; + + // the number of threads per warpgroup. + enum { THREADS_PER_WARP_GROUP = THREADS_PER_WARP * WARPS_PER_WARP_GROUP }; + + // The number of threads per CTA. + enum { THREADS_PER_CTA = WARPS_PER_CTA * THREADS_PER_WARP }; + + enum { GROUPS_M = 1 }; + + enum { GROUPS_N = 1 }; + + enum { GROUPS_K = 1 }; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Hopper_gmma_tile { + // The number of elements computed with a single warp group mma. + enum { M_PER_MMA = GMMA_M, N_PER_MMA = GMMA_N, K_PER_MMA = GMMA_K }; + + // The number of warp groups. + enum { + NUM_WARP_GROUPS = Cta_tile::WARP_GROUP_M * Cta_tile::WARP_GROUP_N * Cta_tile::WARP_GROUP_K + }; + + // The number of elements computed with a single CTA-MMA. + enum { + M_PER_MMA_PER_CTA = M_PER_MMA * Cta_tile::WARP_GROUP_M, + N_PER_MMA_PER_CTA = N_PER_MMA * Cta_tile::WARP_GROUP_N, + K_PER_MMA_PER_CTA = K_PER_MMA * Cta_tile::WARP_GROUP_K + }; + + // The number of MMAs needed to compute the GEMM. + enum { + MMAS_M = (Cta_tile::M + M_PER_MMA_PER_CTA - 1) / M_PER_MMA_PER_CTA, + MMAS_N = (Cta_tile::N + N_PER_MMA_PER_CTA - 1) / N_PER_MMA_PER_CTA, + MMAS_K = (Cta_tile::K + K_PER_MMA_PER_CTA - 1) / K_PER_MMA_PER_CTA, + }; + + // The number of valid MMAs (for Head Size) + enum { + // tile o + VALID_MMAS_N = Div_up::VALUE, + // tile p + VALID_MMAS_K = Div_up::VALUE, + }; + + // The number of elements computed per warp group. + enum { + M_PER_WARP_GROUP = MMAS_M * M_PER_MMA, + N_PER_WARP_GROUP = MMAS_N * N_PER_MMA, + K_PER_WARP_GROUP = MMAS_K * K_PER_MMA, + }; + + // the size of GMMA group, which is GMMA_M x GMMA_N x Kblock. + enum { + M_PER_GMMA_GROUP = GMMA_M, + N_PER_GMMA_GROUP = GMMA_N, + K_PER_GMMA_GROUP = Cta_tile::K, + }; + + // The distribution of threads in the output tile. + // TODO + enum { + THREADS_PER_MMA_M = 8, + THREADS_PER_MMA_N = 4, + }; + + // The number of core matrices per GMMA. + enum { + CORES_M_PER_GROUP = 8 * Cta_tile::WARPS_M_PER_GROUP, + CORES_N_PER_GROUP = 8 * Cta_tile::WARPS_N_PER_GROUP, + CORES_M = GMMA_M / CORES_M_PER_GROUP, + CORES_N = GMMA_N / CORES_N_PER_GROUP, + }; + + // The number of logical rows/cols per thread. + enum { + // A thread owns 1 row per core matrix. + ROWS_PER_THREAD = CORES_M, + // A thread owns 2 col per core matrix. + COLS_PER_THREAD = CORES_N * 2, + }; + + static_assert(ROWS_PER_THREAD == 2); + static_assert(COLS_PER_THREAD == GMMA_N / 4); +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +enum class Hopper_instructions { + HGMMA_FP16, + HGMMA_BF16, + HGMMA_FP32, + IGMMA_INT32, + QGMMA_E4M3_FP32, + QGMMA_E5M2_FP32, + QGMMA_E4M3_FP16, + QGMMA_E5M2_FP16 +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// Hopper HGMMA FP16 Traits +template +struct Hopper_hgmma_fp16_traits + : public Traits { + // The GMMA shape. + enum { GMMA_M = GMMA_M_, GMMA_N = GMMA_N_, GMMA_K = 16 }; + + // is A operand in RF for GMMA? + static constexpr bool GMMA_A_RF = GMMA_A_RF_; + + // is B operand in RF for GMMA? + static constexpr bool GMMA_B_RF = GMMA_B_RF_; + + // GMMA shape has certain requirements. 
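+  // (M is fixed at 64 and K at 16 for 16-bit inputs, while N must be a multiple of 8 no
+  // larger than 256: a 64x128x16 shape is accepted, 64x20x16 is rejected by the asserts
+  // below.)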
+ static_assert(GMMA_K == 16, "GMMA K must be 16; this might change"); + static_assert(GMMA_M == 64, "GMMA M must be 64; this might change"); + static_assert(GMMA_N % 8 == 0, "GMMA N must be multiple of 8; this might change"); + static_assert(GMMA_N <= 256, "GMMA N must be no larger than 256; this might change"); + + // GMMA does not allow both operands coming from RF. + static_assert((GMMA_A_RF && GMMA_B_RF) != true, + "GMMA does not allow both operands coming from RF."); + + // The Cta tile. + template + using Cta_tile = Hopper_cta_tile; + + // The Cta tile. + template + using Cta_padded_tile = + Hopper_cta_tile; + + // The CGA Tile + template + using Cga_tile = Hopper_cga_tile; + + // The MMA tile. + template + using Mma_tile = Hopper_gmma_tile; + + // The handle to differentiate instructions. + static constexpr fmha::Hopper_instructions HOPPER_INSTRUCTION = + fmha::Hopper_instructions::HGMMA_FP16; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// Hopper HGMMA FP32 Traits +template +struct Hopper_hgmma_fp32_traits + : public Traits { + // The GMMA shape. + enum { GMMA_M = GMMA_M_, GMMA_N = GMMA_N_, GMMA_K = 16 }; + + // is A operand in RF for GMMA? + static constexpr bool GMMA_A_RF = GMMA_A_RF_; + + // is B operand in RF for GMMA? + static constexpr bool GMMA_B_RF = GMMA_B_RF_; + + // GMMA shape has certain requirements. + static_assert(GMMA_K == 16, "GMMA K must be 16; this might change"); + static_assert(GMMA_M == 64, "GMMA M must be 64; this might change"); + static_assert(GMMA_N % 8 == 0, "GMMA N must be multiple of 8; this might change"); + static_assert(GMMA_N <= 256, "GMMA N must be no larger than 256; this might change"); + + // GMMA does not allow both operands coming from RF. + static_assert((GMMA_A_RF && GMMA_B_RF) != true, + "GMMA does not allow both operands coming from RF."); + + // The Cta tile. + template + using Cta_tile = Hopper_cta_tile; + + // The Cta tile. + template + using Cta_padded_tile = + Hopper_cta_tile; + + // The CGA Tile + template + using Cga_tile = Hopper_cga_tile; + + // The MMA tile. + template + using Mma_tile = Hopper_gmma_tile; + + // The handle to differentiate instructions. + static constexpr fmha::Hopper_instructions HOPPER_INSTRUCTION = + fmha::Hopper_instructions::HGMMA_FP32; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// Hopper BF16 HGMMA Traits +template +struct Hopper_hgmma_bf16_traits : public Traits { + // The GMMA shape. + enum { GMMA_M = GMMA_M_, GMMA_N = GMMA_N_, GMMA_K = 16 }; + + // is A operand in RF for GMMA? + static constexpr bool GMMA_A_RF = GMMA_A_RF_; + + // is B operand in RF for GMMA? + static constexpr bool GMMA_B_RF = GMMA_B_RF_; + + // GMMA shape has certain requirements. + static_assert(GMMA_K == 16, "GMMA K must be 16; this might change"); + static_assert(GMMA_M == 64, "GMMA M must be 64; this might change"); + static_assert(GMMA_N % 8 == 0, "GMMA N must be multiple of 8; this might change"); + static_assert(GMMA_N <= 256, "GMMA N must be no larger than 256; this might change"); + + // GMMA does not allow both operands coming from RF. + static_assert((GMMA_A_RF && GMMA_B_RF) != true, + "GMMA does not allow both operands coming from RF."); + + // The Cta tile. + template + using Cta_tile = Hopper_cta_tile; + + // The Cta tile. + template + using Cta_padded_tile = + Hopper_cta_tile; + + // The CGA Tile + template + using Cga_tile = Hopper_cga_tile; + + // The MMA tile. 
+ template + using Mma_tile = Hopper_gmma_tile; + + // The handle to differentiate instructions. + static constexpr fmha::Hopper_instructions HOPPER_INSTRUCTION = + fmha::Hopper_instructions::HGMMA_BF16; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// Hopper IGMMA Traits +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Hopper_igmma_int8_int32_traits + : public Traits { + using Base = Traits; + + // The GMMA shape + enum { GMMA_M = GMMA_M_ }; + + enum { GMMA_N = GMMA_N_ }; + + enum { GMMA_K = 32 }; + + // is A operand in RF for GMMA? + static constexpr bool GMMA_A_RF = GMMA_A_RF_; + + // is B operand in RF for GMMA? + static constexpr bool GMMA_B_RF = GMMA_B_RF_; + + // GMMA shape has certain requirement + static_assert(GMMA_K == 32, "GMMA K must be 32; this might change"); + static_assert(GMMA_M == 64, "GMMA M must be 64; this might change"); + static_assert(GMMA_N % 8 == 0, "GMMA N must be multiple of 8; this might change"); + static_assert(GMMA_N <= 256, "GMMA N must be no larger than 256; this might change"); + + // GMMA does not allow both operands coming from RF. + static_assert((GMMA_A_RF && GMMA_B_RF) != true, + "GMMA does not allow both operands coming from RF."); + + // The Cta tile. + template + using Cta_tile = Hopper_cta_tile; + + // The Cta tile. + template + using Cta_padded_tile = + Hopper_cta_tile; + + // The CGA Tile + template + using Cga_tile = Hopper_cga_tile; + + // The MMA tile. + template + using Mma_tile = Hopper_gmma_tile; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// Hopper QGMMA Traits +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Hopper_qgmma_fp8_fp32_traits + : public Traits { + using Base = Traits; + + using Input_type_A = Input_type_A_; + using Input_type_B = Input_type_B_; + using Output_type = Output_type_; + + // The GMMA shape + enum { GMMA_M = GMMA_M_ }; + + enum { GMMA_N = GMMA_N_ }; + + enum { GMMA_K = 32 }; + + // is A operand in RF for GMMA? + static constexpr bool GMMA_A_RF = GMMA_A_RF_; + + // is B operand in RF for GMMA? + static constexpr bool GMMA_B_RF = GMMA_B_RF_; + + // GMMA shape has certain requirement + static_assert(GMMA_K == 32, "GMMA K must be 32; this might change"); + static_assert(GMMA_M == 64, "GMMA M must be 64; this might change"); + static_assert(GMMA_N % 8 == 0, "GMMA N must be multiple of 8; this might change"); + static_assert(GMMA_N <= 256, "GMMA N must be no larger than 256; this might change"); + + // GMMA does not allow both operands coming from RF. + static_assert((GMMA_A_RF && GMMA_B_RF) != true, + "GMMA does not allow both operands coming from RF."); + + // The Cta tile. + template + using Cta_tile = Hopper_cta_tile; + + // The Cta tile. + template + using Cta_padded_tile = + Hopper_cta_tile; + + // The CGA Tile + template + using Cga_tile = Hopper_cga_tile; + + // The XMMA tile. + template + using Mma_tile = Hopper_gmma_tile; + + // Used by low precision floating point types (e4m3, e5m2, etc.) 
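+  // SOFTMAX_FP_DEQUANT_SCALE below is defined as the exact reciprocal of
+  // SOFTMAX_FP_QUANT_SCALE, so whatever scaling is applied to the FP8 softmax output can be
+  // undone exactly afterwards.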
+ static constexpr float SOFTMAX_FP_QUANT_SCALE = Softmax_fp_quant_scale(); + static constexpr float SOFTMAX_FP_DEQUANT_SCALE = 1.f / SOFTMAX_FP_QUANT_SCALE; +}; + +template +using Hopper_qgmma_e4m3_fp32_traits = + Hopper_qgmma_fp8_fp32_traits; + +} // namespace fmha diff --git a/csrc/fmha_v2/fmha/utils.h b/csrc/fmha_v2/fmha/utils.h new file mode 100644 index 0000000000..f65d2fe661 --- /dev/null +++ b/csrc/fmha_v2/fmha/utils.h @@ -0,0 +1,2355 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2011-2024 NVIDIA CORPORATION & AFFILIATES. All rights + * reserved. SPDX-License-Identifier: NVIDIA TensorRT Source Code License Agreement + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. + */ + +#pragma once + +#include +#include +#include +#include + +#if defined(__CLANGD__) +#include <__clang_cuda_builtin_vars.h> +#include <__clang_cuda_math.h> +#endif + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 +#include +#endif + +// include warpgroup related instructions, used by SM90. +#include +// include gmma related instructions, used by SM90. +#include +// include tma related instructions, used by SM90. +#include + +#include "fmha/numeric_types.h" + +#define FP32_I2F_MAGIC_NUMBER 12582912.f +#define FP32_I2F_MAGIC_NUMBER_HEX 0x4b400000 + +extern "C" __device__ uint32_t __nvvm_get_smem_pointer(void* ptr); + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace introspection { + +template +struct Unpack; + +template +struct Unpack { + // if we simply static_assert(false) then compiler will not emit template params upon failure + static_assert(N < INT_MIN, ""); + using Type = std::integral_constant; +}; + +template +struct Unpack { + using Type = Unpack; + using Unpack_first = typename Unpack::Type; + using Unpack_remaining = typename Unpack::Type; +}; + +} // namespace introspection + +// Example usage: +// +// Inspect_ns<(int)USE_LDGSTS_, PRED_REGS, (int)IS_HOPPER> foo; +// +// or +// +// Inspect_ns<(int)USE_LDGSTS_, PRED_REGS, (int)IS_HOPPER>{}.foo(); +// +// Output by nvcc: +// +// ./src/fmha/gmem_tile_qkv_packed.h(70): error: static assertion failed with "" +// detected during: +// instantiation of class "fmha::v2::Unpack [with N=1]" +// (77): here +// instantiation of class "fmha::v2::Unpack [with N=1, Ns=<2, 0>]" +// (84): here +// instantiation of class "fmha::v2::Inspect_ns [with Ns=<1, 2, 0>]" +// (143): here +template +struct Inspect_ns { + using Type = typename introspection::Unpack::Type; +}; + +// Can be used alongside with static_assert() to figure out the conditions when assertion failed +// Example: +// +// Cond_inspect_ns< (int)ROWS >= (int)ROWS_PER_LDG, ROWS, ROWS_PER_LDG> foo; +// +// Output by nvcc (when condition is not met): +// +// ./src/fmha/utils.h(163): error: static assertion failed with "" +// detected during: +// instantiation of class "Cond_inspect_ns [with COND=false, Ns=<32, +// 64>]" +template +struct Cond_inspect_ns { + static_assert(COND, ""); +}; + +// Example: +// +// Inspect_type{}.foo(); +// +// or +// +// Inspect_type foo; +// +// Output by nvcc: +// +// ./src/fmha/utils.h(189): error: class "fmha::Ampere_hmma_tile, 16>" has no member "Dummy" +// 
detected during: +// instantiation of class "Inspect_type [with +// T=fmha::Ampere_hmma_tile, 16>]" +template +struct Inspect_type { + // Purposefully trigger error by referencing non-existent T::Dummy + using Dummy = typename T::Dummy; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace fmha { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct Row { + static constexpr bool COL = false; + static constexpr bool ROW = true; +}; + +struct Col { + static constexpr bool COL = true; + static constexpr bool ROW = false; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Round_up { + enum { VALUE = (M + N - 1) / N * N }; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Tile_nhw { + enum { N = N_, H = H_, W = W_ }; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Next_power_of_two {}; + +template +struct Next_power_of_two { + enum { VALUE = M }; +}; + +template <> +struct Next_power_of_two<3, false> { + enum { VALUE = 4 }; +}; + +template <> +struct Next_power_of_two<5, false> { + enum { VALUE = 8 }; +}; + +template <> +struct Next_power_of_two<6, false> { + enum { VALUE = 8 }; +}; + +template <> +struct Next_power_of_two<7, false> { + enum { VALUE = 8 }; +}; + +template <> +struct Next_power_of_two<9, false> { + enum { VALUE = 16 }; +}; + +template <> +struct Next_power_of_two<10, false> { + enum { VALUE = 16 }; +}; + +template <> +struct Next_power_of_two<11, false> { + enum { VALUE = 16 }; +}; + +template <> +struct Next_power_of_two<12, false> { + enum { VALUE = 16 }; +}; + +template <> +struct Next_power_of_two<13, false> { + enum { VALUE = 16 }; +}; + +template <> +struct Next_power_of_two<14, false> { + enum { VALUE = 16 }; +}; + +template <> +struct Next_power_of_two<15, false> { + enum { VALUE = 16 }; +}; + +template <> +struct Next_power_of_two<24, false> { + enum { VALUE = 32 }; +}; + +template <> +struct Next_power_of_two<40, false> { + enum { VALUE = 64 }; +}; + +template <> +struct Next_power_of_two<48, false> { + enum { VALUE = 64 }; +}; + +template <> +struct Next_power_of_two<72, false> { + enum { VALUE = 128 }; +}; + +template <> +struct Next_power_of_two<80, false> { + enum { VALUE = 128 }; +}; + +template <> +struct Next_power_of_two<96, false> { + enum { VALUE = 128 }; +}; + +template <> +struct Next_power_of_two<104, false> { + enum { VALUE = 128 }; +}; + +template <> +struct Next_power_of_two<112, false> { + enum { VALUE = 128 }; +}; + +template <> +struct Next_power_of_two<144, false> { + enum { VALUE = 256 }; +}; + +template <> +struct Next_power_of_two<160, false> { + enum { VALUE = 256 }; +}; + +template <> +struct Next_power_of_two<192, false> { + enum { VALUE = 256 }; +}; + +template <> +struct Next_power_of_two<576, false> { + enum { VALUE = 1024 }; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Prev_power_of_two {}; + +template +struct Prev_power_of_two { + enum { VALUE = N }; +}; + +template <> +struct Prev_power_of_two<3, false> { + enum { VALUE = 2 }; +}; + +template <> +struct Prev_power_of_two<5, false> { + enum { VALUE = 4 }; +}; + +template <> +struct Prev_power_of_two<6, false> { + enum { VALUE = 4 }; +}; + +template <> 
+struct Prev_power_of_two<7, false> { + enum { VALUE = 4 }; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Compute_skew { + // The size of a transaction. + enum { BYTES_PER_TRX = 128 }; + + // The remainder of the row without skew. + enum { REMAINDER = BYTES_PER_ROW % BYTES_PER_TRX }; + + // The value. + enum { VALUE = REMAINDER <= SKEW ? SKEW - REMAINDER : BYTES_PER_TRX + SKEW - REMAINDER }; + + // Make sure the math works ;) + static_assert((BYTES_PER_ROW + VALUE) % BYTES_PER_TRX == SKEW, ""); +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Compute_skew { + // No skew! + enum { VALUE = 0 }; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Div_up { + enum { VALUE = (M + N - 1) / N }; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Max { + enum { VALUE = A >= B ? A : B }; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Max_3 { + enum { VALUE = Max::VALUE, C>::VALUE }; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Min { + enum { VALUE = A <= B ? A : B }; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Uint_from_size_in_bytes {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Uint_from_size_in_bytes<1> { + using Type = uint8_t; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Uint_from_size_in_bytes<2> { + using Type = uint16_t; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Uint_from_size_in_bytes<4> { + using Type = uint32_t; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Uint_from_size_in_bytes<8> { + using Type = uint2; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Uint_from_size_in_bytes<16> { + using Type = uint4; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Warp_masks {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Warp_masks<8, 1, 1> { + enum { M = 0xe0, N = 0x00, K = 0x00 }; +}; + +template <> +struct Warp_masks<4, 2, 1> { + enum { M = 0x60, N = 0x80, K = 0x00 }; +}; + +template <> +struct Warp_masks<4, 1, 2> { + enum { M = 0x60, N = 0x00, K = 0x80 }; +}; + +template <> +struct Warp_masks<4, 1, 1> { + enum { M = 0x60, N = 0x00, K = 0x00 }; +}; + +template <> +struct Warp_masks<2, 4, 1> { + enum { M = 0x20, N = 0xc0, K = 0x00 }; +}; + +template <> +struct Warp_masks<2, 2, 2> { + enum { M = 0x20, N = 0x40, K = 0x80 }; +}; + +template <> +struct Warp_masks<2, 2, 1> { + enum { M = 0x20, N = 0x40, K = 0x00 }; +}; + +template <> +struct Warp_masks<2, 1, 2> { + enum { M = 0x20, N = 0x00, K = 0x40 }; +}; + +template <> +struct Warp_masks<2, 1, 1> { + enum { M = 0x20, N 
= 0x00, K = 0x00 }; +}; + +template <> +struct Warp_masks<1, 8, 1> { + enum { M = 0x00, N = 0xe0, K = 0x00 }; +}; + +template <> +struct Warp_masks<1, 4, 2> { + enum { M = 0x00, N = 0x60, K = 0x80 }; +}; + +template <> +struct Warp_masks<1, 4, 1> { + enum { M = 0x00, N = 0x60, K = 0x00 }; +}; + +template <> +struct Warp_masks<1, 2, 2> { + enum { M = 0x00, N = 0x20, K = 0x40 }; +}; + +template <> +struct Warp_masks<1, 2, 1> { + enum { M = 0x00, N = 0x20, K = 0x00 }; +}; + +template <> +struct Warp_masks<1, 1, 4> { + enum { M = 0x00, N = 0x00, K = 0x60 }; +}; + +template <> +struct Warp_masks<1, 1, 2> { + enum { M = 0x00, N = 0x00, K = 0x20 }; +}; + +template <> +struct Warp_masks<1, 1, 1> { + enum { M = 0x00, N = 0x00, K = 0x00 }; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ __host__ T div_up(T m, T n) { + return (m + n - 1) / n; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline int clz(int x) { + for (int i = 31; i >= 0; --i) { + if ((1 << i) & x) { + return 31 - i; + } + } + return 32; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline int find_log_2(int x, bool round_up = false) { + int a = 31 - clz(x); + if (round_up) { + a += (x & (x - 1)) ? 1 : 0; + } + return a; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline void find_divisor(uint32_t& mul, uint32_t& shr, int x) { + assert(x != 0); + if (x == 1) { + // If dividing by 1, reduced math doesn't work because mul_coeff would need to be 2^32, + // which doesn't fit into unsigned int. the div() routine handles this special case + // separately. + mul = 0; + shr = 0; + } else { + // To express the division N/D in terms of a multiplication, what we first + // imagine is simply N*(1/D). However, 1/D will always evaluate to 0 (for D>1), + // so we need another way. There's nothing that says we have to use exactly + // the fraction 1/D; instead it could be any X/Y that reduces to 1/D (i.e., + // Y=X*D), or at least to "close enough" to it. If we pick Y that is a power + // of two, then the N*(X/Y) can be N*X followed by a right-shift by some amount. + // The power of two we should pick should be at least 2^32, because in the + // div() routine we'll use umulhi(), which returns only the upper 32 bits -- + // this being equivalent to a right-shift by 32. But we might want a higher + // power of two for better accuracy depending on the magnitude of the denominator. + // Once we've picked Y, then X [our mul_coeff value] is simply Y/D, rounding up, + // and we save shift_coeff as whatever further shift we have to do beyond + // what the umulhi() implies. 
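+    // Worked example (illustrative): for x = 3, find_log_2(3, true) = 2, so p = 33,
+    // m = ceil(2^33 / 3) = 0xAAAAAAAB and shr = 1. fast_divmod() below then computes
+    // div = __umulhi(N, 0xAAAAAAAB) >> 1, which equals N / 3 for every 32-bit N.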
+ uint32_t p = 31 + find_log_2(x, true); + uint32_t m = (uint32_t)(((1ull << p) + (uint32_t)x - 1) / (uint32_t)x); + + mul = m; + shr = p - 32; + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void fast_divmod(int& div, int& mod, int x, int y, uint32_t mul, uint32_t shr) { + if (y == 1) { + div = x; + mod = 0; + } else { + div = __umulhi((uint32_t)x, mul) >> shr; + mod = x - div * y; + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ uint32_t hadd2(uint32_t a, uint32_t b) { + uint32_t c; + asm volatile("add.f16x2 %0, %1, %2;\n" : "=r"(c) : "r"(a), "r"(b)); + return c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ uint32_t bfadd2(uint32_t a, uint32_t b) { + uint32_t c; + uint32_t one = 0x3f803f80; + ; + asm volatile("fma.rn.bf16x2 %0, %1, %3, %2;\n" : "=r"(c) : "r"(a), "r"(b), "r"(one)); + return c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ uint32_t hmax2(uint32_t a, uint32_t b) { + uint32_t c; +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 + asm volatile("max.f16x2 %0, %1, %2;" : "=r"(c) : "r"(a), "r"(b)); +#else + asm volatile( + "{\n" + "\t .reg .f16x2 sela, selb;\n" + "\n" + "\t set.ge.f16x2.f16x2 sela, %1, %2;\n" + "\t set.gt.f16x2.f16x2 selb, %2, %1;\n" + "\n" + "\t mul.f16x2 %0, sela, %1;\n" + "\t fma.rn.f16x2 %0, selb, %2, %0;\n" + "}\n" + : "=r"(c) + : "r"(a), "r"(b)); +#endif + return c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ uint2 hmax4(uint2 a, uint2 b) { + uint2 c; + c.x = hmax2(a.x, b.x); + c.y = hmax2(a.y, b.y); + return c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ uint4 hmax8(uint4 a, uint4 b) { + uint4 c; + c.x = hmax2(a.x, b.x); + c.y = hmax2(a.y, b.y); + c.z = hmax2(a.z, b.z); + c.w = hmax2(a.w, b.w); + return c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ uint32_t hmin2(uint32_t a, uint32_t b) { + uint32_t c; +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 + asm volatile("min.f16x2 %0, %1, %2;" : "=r"(c) : "r"(a), "r"(b)); +#else + asm volatile( + "{\n" + "\t .reg .f16x2 sela, selb;\n" + "\n" + "\t set.le.f16x2.f16x2 sela, %1, %2;\n" + "\t set.lt.f16x2.f16x2 selb, %2, %1;\n" + "\n" + "\t mul.f16x2 %0, sela, %1;\n" + "\t fma.rn.f16x2 %0, selb, %2, %0;\n" + "}\n" + : "=r"(c) + : "r"(a), "r"(b)); +#endif + return c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ uint32_t hmul2(uint32_t a, uint32_t b) { + uint32_t c; + asm volatile("mul.f16x2 %0, %1, %2;\n" : "=r"(c) : "r"(a), "r"(b)); + return c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ uint32_t bfmul2(uint32_t a, uint32_t b) { + uint32_t c; + asm("{.reg .b32 c;\n" + " mov.b32 c, 0x80008000U;\n" + " fma.rn.bf16x2 %0,%1,%2,c;}\n" + : "=r"(c) + : "r"(a), "r"(b)); + return c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ uint2 
hmul4(uint2 a, uint2 b) { + uint2 c; + c.x = hmul2(a.x, b.x); + c.y = hmul2(a.y, b.y); + return c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ uint4 hmul8(uint4 a, uint4 b) { + uint4 c; + c.x = hmul2(a.x, b.x); + c.y = hmul2(a.y, b.y); + c.z = hmul2(a.z, b.z); + c.w = hmul2(a.w, b.w); + return c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ uint4 hmul8(uint32_t a, uint4 b) { + uint4 c; + c.x = hmul2(a, b.x); + c.y = hmul2(a, b.y); + c.z = hmul2(a, b.z); + c.w = hmul2(a, b.w); + return c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// Template function to support both half and bfloat16 +template +inline __device__ uint32_t mul2(uint32_t a, uint32_t b) { + return hmul2(a, b); +} + +template <> +inline __device__ uint32_t mul2(uint32_t a, uint32_t b) { + return bfmul2(a, b); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// Template function to support both half and bfloat16 +template +inline __device__ uint4 mul8(uint32_t a, uint4 b) { + uint4 c; + c.x = hmul2(a, b.x); + c.y = hmul2(a, b.y); + c.z = hmul2(a, b.z); + c.w = hmul2(a, b.w); + return c; +} + +template <> +inline __device__ uint4 mul8(uint32_t a, uint4 b) { + uint4 c; + c.x = bfmul2(a, b.x); + c.y = bfmul2(a, b.y); + c.z = bfmul2(a, b.z); + c.w = bfmul2(a, b.w); + return c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ uint32_t hrelu2(uint32_t x) { + uint32_t res; + uint32_t const zero = 0u; +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 + asm volatile("max.f16x2 %0, %1, %2;\n" : "=r"(res) : "r"(x), "r"(zero)); +#else + asm volatile( + "{\n" + "\t .reg .f16x2 sela;\n" + "\t set.gtu.u32.f16x2 sela, %1, %2;\n" + "\t and.b32 %0, sela, %1;\n" + "}\n" + : "=r"(res) + : "r"(x), "r"(zero)); +#endif + return res; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ uint32_t bfrelu2(uint32_t x) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 + uint32_t res; + uint32_t const zero = 0u; + asm volatile("max.bf16x2 %0, %1, %2;\n" : "=r"(res) : "r"(x), "r"(zero)); + return res; +#endif + // not implemented yet + return x; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// Template function to support both half and bfloat16 +template +inline __device__ uint32_t relu2(uint32_t x) { + return hrelu2(x); +} + +template <> +inline __device__ uint32_t relu2(uint32_t x) { + return bfrelu2(x); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ uint32_t habs2(uint32_t x) { + uint32_t res; + asm volatile("abs.f16x2 %0, %1;\n" : "=r"(res) : "r"(x)); + return res; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// static inline __device__ uint32_t add_bias(uint32_t a, uint32_t bias, bool relu) { +// uint32_t c; +// #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 +// if( relu ) { +// uint32_t one = 0x3c003c00u; +// asm volatile("fma.rn.f16x2.relu %0, %1, %2, %3;" : "=r"(c) : "r"(a), "r"(one), +// "r"(bias)); +// } else { +// c = hadd2(a, bias); +// } +// #else +// c = 
hadd2(a, bias); +// if( relu ) { +// c = hrelu2(c); +// } +// #endif +// return c; +// } + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// static inline __device__ uint2 add_bias(uint2 a, uint2 bias, bool relu) { +// uint2 dst; +// dst.x = add_bias(a.x, bias.x, relu); +// dst.y = add_bias(a.y, bias.y, relu); +// return dst; +// } + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// static inline __device__ uint4 add_bias(uint4 a, uint4 bias, bool relu) { +// uint4 dst; +// dst.x = add_bias(a.x, bias.x, relu); +// dst.y = add_bias(a.y, bias.y, relu); +// dst.z = add_bias(a.z, bias.z, relu); +// dst.w = add_bias(a.w, bias.w, relu); +// return dst; +// } + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// clamp float +inf/-inf +static inline __device__ float satfinite(float x) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 860 + // bit representation of maximum value of float + uint32_t clamp_value = 0x7f7fffffu; + asm volatile("min.xorsign.abs.f32 %0, %0, %1;" : "+f"(x) : "r"(clamp_value)); + return x; +#else + // bit representation of maximum and minimum value of float + uint32_t umax = 0x7f7fffffu; + uint32_t umin = 0xff7fffffu; + float out; + asm volatile("min.f32 %0, %1, %2;" : "=f"(out) : "f"(x), "r"(umax)); + asm volatile("max.f32 %0, %0, %1;" : "+f"(out) : "r"(umin)); + return out; +#endif +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// clamp half2 +inf/-inf +static inline __device__ uint32_t satfinite_h2(uint32_t h2) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 860 + uint32_t out, clamp_value; + clamp_value = 0x7bff7bffu; + asm volatile("min.xorsign.abs.f16x2 %0, %1, %2;" : "=r"(out) : "r"(h2), "r"(clamp_value)); + return out; +#elif defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 800 + // bit representation of maximum and minimum value of half2 + uint32_t umax = 0x7bff7bffu; + uint32_t umin = 0xfbfffbffu; + uint32_t out; + asm volatile("min.f16x2 %0, %1, %2;" : "=r"(out) : "r"(h2), "r"(umax)); + asm volatile("max.f16x2 %0, %0, %1;" : "+r"(out) : "r"(umin)); + return out; +#else + // Take the absolute value of h2. It should map to |Rx| in SASS. + uint32_t p2; + asm volatile("abs.f16x2 %0, %1;" : "=r"(p2) : "r"(h2)); + + // Compute a mask for each fp16: 0xffff if +INF and 0x0000 otherwise. + uint32_t inf2 = 0x7c007c00u; + uint32_t mask; + asm volatile("set.eq.u32.f16x2 %0, %1, %2;" : "=r"(mask) : "r"(p2), "r"(inf2)); + + // Recreate the new value. 0x7bff is the max value for FP16. + p2 = (~mask & p2) | (mask & 0x7bff7bff); + + // Simply re-add the sign and we're done. + return p2 | (h2 & 0x80008000); +#endif +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +static inline __device__ T clamp(T x, T lb, T ub) { + return x < lb ? lb : (x > ub ? 
ub : x); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float custom_exp2f(float x, float scale, float scaled_max) { + float d1, d2; + asm("fma.rz.ftz.f32 %0, %1, %2, %3;" : "=f"(d1) : "f"(x), "f"(scale), "f"(-scaled_max)); + asm("ex2.approx.ftz.f32 %0, %1;" : "=f"(d2) : "f"(d1)); + return d2; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ uint16_t clamp_to_zero(uint16_t x) { + uint16_t mask; + asm volatile("set.gtu %0, %1, 0;" : "=h"(mask) : "h"(x)); + return mask & x; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ uint16_t float_to_half(float f) { + uint16_t h; + asm volatile("cvt.rn.f16.f32 %0, %1;" : "=h"(h) : "f"(f)); + return h; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ bf16_t float_to_bf16(float f) { return __float2bfloat16(f); } + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ uint32_t float2_to_half2(float a, float b) { + uint32_t c; +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 + asm volatile("cvt.rn.f16x2.f32 %0, %1, %2;\n" : "=r"(c) : "f"(b), "f"(a)); +#else + uint16_t lo = float_to_half(a); + uint16_t hi = float_to_half(b); + asm volatile("mov.b32 %0, {%1, %2};\n" : "=r"(c) : "h"(lo), "h"(hi)); +#endif + return c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ uint32_t float2_to_bf16_x2(float a, float b) { + uint32_t c; +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 + asm volatile("cvt.rn.bf16x2.f32 %0, %1, %2;\n" : "=r"(c) : "f"(b), "f"(a)); +#else + uint16_t* px = reinterpret_cast(&a); + uint16_t* py = reinterpret_cast(&b); + uint16_t value = px[1]; + uint16_t value2 = py[1]; + + if (px[0] == 0x8000) { + if ((value & 0x1) == 1) value++; + } else if (px[0] > 0x8000) { + value++; + } + + if (py[0] == 0x8000) { + if ((value2 & 0x1) == 1) value2++; + } else if (py[0] > 0x8000) { + value2++; + } + + uint32_t high = reinterpret_cast(value2); + c = (high << 16) | value; +#endif + return c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// Template function to support both half and bfloat16 +template +inline __device__ uint32_t float2_to_16bit_2(float a, float b) { + return float2_to_half2(a, b); +} + +template <> +inline __device__ uint32_t float2_to_16bit_2(float a, float b) { + return float2_to_bf16_x2(a, b); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ uint32_t float_to_half2(float a) { return float2_to_half2(a, a); } + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ uint32_t float2_to_half2(float2 const& f) { + return float2_to_half2(f.x, f.y); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ uint32_t float_to_bf16_2(float a) { return float2_to_bf16_x2(a, a); } + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ uint2 float4_to_half4(float x, float y, float z, 
float w) { + uint2 d; + d.x = float2_to_half2(x, y); + d.y = float2_to_half2(z, w); + return d; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// Template function to support both half and bfloat16 +template +inline __device__ uint2 float4_to_16bit_x4(float x, float y, float z, float w) { + uint2 d; + d.x = float2_to_half2(x, y); + d.y = float2_to_half2(z, w); + return d; +} + +template <> +inline __device__ uint2 float4_to_16bit_x4(float x, float y, float z, float w) { + uint2 d; + d.x = float2_to_bf16_x2(x, y); + d.y = float2_to_bf16_x2(z, w); + return d; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ uint32_t hfma2(uint32_t a, uint32_t b, uint32_t c) { + uint32_t d; + asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(d) : "r"(a), "r"(b), "r"(c)); + return d; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ uint32_t hfma2_relu(uint32_t a, uint32_t b, uint32_t c) { + uint32_t d; +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 + asm volatile("fma.rn.f16x2.relu %0, %1, %2, %3;" : "=r"(d) : "r"(a), "r"(b), "r"(c)); +#else + d = hrelu2(hfma2(a, b, c)); +#endif + return d; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ uint32_t h0_h0(uint32_t x) { + uint32_t y; + asm volatile("{.reg .f16 lo, hi; mov.b32 {lo, hi}, %1; mov.b32 %0, {lo, lo};}\n" + : "=r"(y) + : "r"(x)); + return y; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ float h0_to_float(uint32_t h2) { + float f; + asm volatile( + "{\n" + ".reg .f16 lo, hi;\n" + "mov.b32 {lo, hi}, %1;\n" + "cvt.f32.f16 %0, lo;\n" + "}\n" + : "=f"(f) + : "r"(h2)); + return f; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ uint32_t h1_h1(uint32_t x) { + uint32_t y; + asm volatile("{.reg .f16 lo, hi; mov.b32 {lo, hi}, %1; mov.b32 %0, {hi, hi};}\n" + : "=r"(y) + : "r"(x)); + return y; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ uint16_t hadd(uint16_t a, uint16_t b) { + uint16_t d; + asm volatile("add.f16 %0, %1, %2;" : "=h"(d) : "h"(a), "h"(b)); + return d; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ uint32_t hadd(uint32_t a, uint32_t b) { return hadd2(a, b); } + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ uint2 hadd4(uint2 a, uint2 b) { + uint2 c; + c.x = hadd2(a.x, b.x); + c.y = hadd2(a.y, b.y); + return c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ uint2 hadd(uint2 a, uint2 b) { return hadd4(a, b); } + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ uint4 hadd8(uint4 a, uint4 b) { + uint4 c; + c.x = hadd2(a.x, b.x); + c.y = hadd2(a.y, b.y); + c.z = hadd2(a.z, b.z); + c.w = hadd2(a.w, b.w); + return c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// Template 
function to support both half and bfloat16 +template +inline __device__ uint4 add8(uint4 a, uint4 b) { + return hadd8(a, b); +} + +template <> +inline __device__ uint4 add8(uint4 a, uint4 b) { + uint4 c; + c.x = bfadd2(a.x, b.x); + c.y = bfadd2(a.y, b.y); + c.z = bfadd2(a.z, b.z); + c.w = bfadd2(a.w, b.w); + return c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ uint4 fadd4(uint4 a, uint4 b) { + float4 c; + c.x = reinterpret_cast(a.x) + reinterpret_cast(b.x); + c.y = reinterpret_cast(a.y) + reinterpret_cast(b.y); + c.z = reinterpret_cast(a.z) + reinterpret_cast(b.z); + c.w = reinterpret_cast(a.w) + reinterpret_cast(b.w); + return reinterpret_cast(c); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ uint4 hadd(uint4 a, uint4 b) { return hadd8(a, b); } + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ float half_to_float(uint16_t h) { + float f; + asm volatile("cvt.f32.f16 %0, %1;\n" : "=f"(f) : "h"(h)); + return f; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ float bf16_to_float(uint16_t h) { + float f; + asm volatile("mov.b32 %0, {0, %1};\n" : "=f"(f) : "h"(h)); + return f; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ float2 half2_to_float2(uint32_t x) { + uint16_t lo, hi; + asm volatile("mov.b32 {%0, %1}, %2;\n" : "=h"(lo), "=h"(hi) : "r"(x)); + return make_float2(half_to_float(lo), half_to_float(hi)); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ float2 bf16_2_to_float2(uint32_t x) { + float2 res; + asm volatile( + "{\n" + " .reg .b16 lo, hi;\n" + " mov.b32 {lo, hi}, %2;\n" + " mov.b32 %0, {0, lo};\n" + " mov.b32 %1, {0, hi};\n" + "}\n" + : "=f"(res.x), "=f"(res.y) + : "r"(x)); + return res; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// Template function to support both half and bfloat16 +template +inline __device__ float2 convert_from_16bit_2(uint32_t x) { + return half2_to_float2(x); +} + +template <> +inline __device__ float2 convert_from_16bit_2(uint32_t x) { + return bf16_2_to_float2(x); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ void half2_to_float2(float& x, float& y, uint32_t h) { + float2 tmp = half2_to_float2(h); + x = tmp.x; + y = tmp.y; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ uint16_t hfma(uint16_t a, uint16_t b, uint16_t c) { + uint16_t d; + asm volatile("fma.rn.f16 %0, %1, %2, %3;" : "=h"(d) : "h"(a), "h"(b), "h"(c)); + return d; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ uint16_t hmul(uint16_t a, uint16_t b) { + uint16_t d; + asm volatile("mul.f16 %0, %1, %2;" : "=h"(d) : "h"(a), "h"(b)); + return d; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// Converted two half2's or bf162's into float, then take their dot product. 
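+// That is, for packed pairs a = (a0, a1) and b = (b0, b1) held as two 16-bit lanes of a
+// 32-bit register, the result is float(a0) * float(b0) + float(a1) * float(b1), with the
+// products and the sum accumulated in fp32 rather than in the 16-bit type.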
+template +inline __device__ float fma2_in_float(uint32_t const a, uint32_t const b) { + float2 af = fmha::convert_from_16bit_2(a); + float2 bf = fmha::convert_from_16bit_2(b); + return af.x * bf.x + af.y * bf.y; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// Converted two vectors of 8 half's or bf16's into float, then take their dot product. +template +inline __device__ float fma8_in_float(uint4 const a, uint4 const b) { + float sum; + sum = fmha::fma2_in_float(a.x, b.x); + sum += fmha::fma2_in_float(a.y, b.y); + sum += fmha::fma2_in_float(a.z, b.z); + sum += fmha::fma2_in_float(a.w, b.w); + return sum; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ float sigmoid(float x) { return 1.f / (1.f + expf(-x)); } + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void clear(uint16_t& dst) { dst = uint16_t(0); } + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void clear(uint32_t& dst) { dst = 0u; } + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void clear(uint2& dst) { dst = make_uint2(0u, 0u); } + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void clear(uint4& dst) { dst = make_uint4(0u, 0u, 0u, 0u); } + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// P R E D I C A T E P A C K I N G +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +enum { BYTES_PER_REG = 4, PREDS_PER_BYTE = 4, PREDS_PER_REG = BYTES_PER_REG * PREDS_PER_BYTE }; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Compute_number_of_pred_regs { + enum { VALUE = Div_up::VALUE }; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ void pack_predicates(uint32_t (&preds)[M], uint32_t const (&p)[N]) { + // Make sure the values match. + static_assert(Compute_number_of_pred_regs::VALUE == M, ""); + + // The number of complete steps (where we use all the predicates in a byte). + enum { COMPLETE_BYTES = N / PREDS_PER_BYTE }; + + // Make sure we allocated enough predicate registers. + static_assert(Div_up::VALUE <= M, ""); + + // The remainder. + enum { REMAINDER = N - COMPLETE_BYTES * PREDS_PER_BYTE }; + + // Make sure we got the math right and the remainder is between 0 and 3. + static_assert(REMAINDER >= 0 && REMAINDER <= 3, ""); + + // The mask to extract the predicates. + enum { COMPLETE_MASK = (1 << PREDS_PER_BYTE) - 1 }; + + // Run complete steps. +#pragma unroll + for (int ii = 0; ii < M; ++ii) { + // The number of complete bytes for that register. Be careful it can be > than 4 ;) + int const COMPLETE = (N - ii * PREDS_PER_REG) / PREDS_PER_BYTE; + + // Pack the predicates in a register. + uint32_t reg = 0u; +#pragma unroll + for (int jj = 0; jj < 4; ++jj) { + // Early exit. + if (jj >= COMPLETE) { + break; + } + + // Prepare the array of predicates. 
+ bool tmp[PREDS_PER_BYTE]; +#pragma unroll + for (int kk = 0; kk < PREDS_PER_BYTE; ++kk) { + tmp[kk] = p[ii * PREDS_PER_REG + jj * PREDS_PER_BYTE + kk] != 0; + } + + // Store the predicates. +#pragma unroll + for (int kk = 0; kk < PREDS_PER_BYTE; ++kk) { + if (tmp[kk]) { + reg |= 1u << (jj * 8 + kk); + } + } + } + + // Skip the rest of the code if we do not have a remainder. + if (COMPLETE < 4 && REMAINDER > 0) { + // The mask to extract the predicates. + enum { REMAINDER_MASK = (1 << REMAINDER) - 1 }; + + // Prepare the array of predicates. + bool tmp[PREDS_PER_BYTE]; +#pragma unroll + for (int jj = 0; jj < REMAINDER; ++jj) { + tmp[jj] = p[COMPLETE_BYTES * PREDS_PER_BYTE + jj] != 0; + } + + // Store the predicates. +#pragma unroll + for (int jj = 0; jj < REMAINDER; ++jj) { + if (tmp[jj]) { + reg |= 1u << (COMPLETE * 8 + jj); + } + } + } + + // Store the predicate register. + preds[ii] = reg; + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ uint32_t pack_predicates(uint32_t const (&p)[N]) { + uint32_t tmp[1]; + pack_predicates(tmp, p); + return tmp[0]; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// G E N E R I C P R E D I C A T E D L D G S T S +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ void ldgsts_(Functor& fct, uint32_t const (&preds)[M]) { + // The number of complete bytes (where we use all the predicates in a byte). + enum { COMPLETE = N / PREDS_PER_BYTE }; + + // Make sure we did allocate enough predicates. + static_assert(Div_up::VALUE <= M, ""); + + // The remainder. + enum { REMAINDER = N - COMPLETE * PREDS_PER_BYTE }; + + // Make sure we got the math right and the remainder is between 0 and 3. + static_assert(REMAINDER >= 0 && REMAINDER <= 3, ""); + + // The mask to extract the predicates. + enum { COMPLETE_MASK = (1 << PREDS_PER_BYTE) - 1 }; + +// Clear the fetch registers. +#pragma unroll + for (int ii = 0; ii < N; ++ii) { + fct.clear(ii); + } + + // Run complete steps. + bool p[PREDS_PER_BYTE]; +#pragma unroll + for (int ii = 0; ii < COMPLETE; ++ii) { + // The predicate. + uint32_t reg = preds[ii / BYTES_PER_REG]; + + // Extract the predicates. +#pragma unroll + for (int jj = 0; jj < PREDS_PER_BYTE; ++jj) { + uint32_t mask = 1u << (ii % BYTES_PER_REG * 8 + jj); + p[jj] = (reg & mask) != 0u; + } + +// Issue the loads. +#pragma unroll + for (int jj = 0; jj < PREDS_PER_BYTE; ++jj) { + fct.ldgsts(ii * PREDS_PER_BYTE + jj, p[jj]); + } + } + + // Skip the rest of the code if we do not have a remainder. + if (REMAINDER > 0) { + // The mask to extract the predicates. + enum { REMAINDER_MASK = (1 << REMAINDER) - 1 }; + + // The predicate register. + uint32_t reg = preds[COMPLETE / BYTES_PER_REG]; + + // Extract the predicates. +#pragma unroll + for (int jj = 0; jj < PREDS_PER_BYTE; ++jj) { + uint32_t mask = 1u << (COMPLETE % BYTES_PER_REG * 8 + jj); + p[jj] = (reg & mask) != 0u; + } + +// Issue the loads. 
+#pragma unroll
+    for (int ii = 0; ii < REMAINDER; ++ii) {
+      fct.ldgsts(COMPLETE * PREDS_PER_BYTE + ii, p[ii]);
+    }
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <int N, typename Functor>
+inline __device__ void ldgsts_(Functor& fct, uint32_t preds) {
+  uint32_t tmp[1] = {preds};
+  ldgsts_<N>(fct, tmp);
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// L D G
+//
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+inline __device__ void ldg(uint8_t& dst, void const* ptr) {
+  dst = *reinterpret_cast<uint8_t const*>(ptr);
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+inline __device__ void ldg(uint16_t& dst, void const* ptr) {
+  dst = *reinterpret_cast<uint16_t const*>(ptr);
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+inline __device__ void ldg(uint32_t& dst, void const* ptr) {
+  dst = *reinterpret_cast<uint32_t const*>(ptr);
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+inline __device__ void ldg(uint2& dst, void const* ptr) {
+  dst = *reinterpret_cast<uint2 const*>(ptr);
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+inline __device__ void ldg(uint4& dst, void const* ptr) {
+  dst = *reinterpret_cast<uint4 const*>(ptr);
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <typename Data_type, int N>
+struct Ldg_functor {
+  // Ctor.
+  inline __device__ Ldg_functor(Data_type (&fetch)[N], void const* (&ptrs)[N])
+      : fetch_(fetch), ptrs_(ptrs) {}
+
+  // Clear the element.
+  inline __device__ void clear(int ii) { fmha::clear(fetch_[ii]); }
+
+  // Trigger the loads.
+  inline __device__ void ldgsts(int ii, bool p) {
+    if (p) {
+      ldg(fetch_[ii], ptrs_[ii]);
+    }
+  }
+
+  // The fetch registers.
+  Data_type (&fetch_)[N];
+  // The pointers.
+ void const* (&ptrs_)[N]; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ void ldg_(Data_type (&fetch)[N], void const* (&ptrs)[N], uint32_t (&preds)[M]) { + Ldg_functor fct(fetch, ptrs); + ldgsts_(fct, preds); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ void ldg(uint8_t (&fetch)[N], void const* (&ptrs)[N], uint32_t (&preds)[M]) { + ldg_(fetch, ptrs, preds); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ void ldg(uint16_t (&fetch)[N], void const* (&ptrs)[N], uint32_t (&preds)[M]) { + ldg_(fetch, ptrs, preds); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ void ldg(uint32_t (&fetch)[N], void const* (&ptrs)[N], uint32_t (&preds)[M]) { + ldg_(fetch, ptrs, preds); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ void ldg(uint2 (&fetch)[N], void const* (&ptrs)[N], uint32_t (&preds)[M]) { + ldg_(fetch, ptrs, preds); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ void ldg(uint4 (&fetch)[N], void const* (&ptrs)[N], uint32_t (&preds)[M]) { + ldg_(fetch, ptrs, preds); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ void ldgdepbar() { + if (USE_LDGSTS) { + asm volatile("cp.async.commit_group;\n" ::); + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ void depbar_() { + if (USE_LDGSTS) { + asm volatile("cp.async.wait_group %0;\n" ::"n"(COUNT)); + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ void depbar() { + if (USE_LDGSTS) { + int const VALUE = Max::VALUE; + asm volatile("cp.async.wait_group %0;\n" ::"n"(VALUE)); + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void ldgsts128(uint32_t dst, void const* src, bool p = true) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 + uint32_t m = p ? 16u : 0u; + asm volatile("cp.async.cg.shared.global [%0], [%1], 16, %2;\n" ::"r"(dst), "l"(src), "r"(m)); +#endif +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Ldgsts_functor { + // Ctor. + inline __device__ Ldgsts_functor(uint32_t (&smem_ptrs)[N], void const* (&gmem_ptrs)[N]) + : smem_ptrs_(smem_ptrs), gmem_ptrs_(gmem_ptrs) {} + + // Does nothing. + inline __device__ void clear(int ii) {} + + // Trigger the load-store instruction. + inline __device__ void ldgsts(int ii, bool p) { ldgsts128(smem_ptrs_[ii], gmem_ptrs_[ii], p); } + + // The shared memory pointers. + uint32_t (&smem_ptrs_)[N]; + // The global memory pointers. 
+ void const* (&gmem_ptrs_)[N]; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ void ldgsts(uint32_t (&dst)[N], void const* (&src)[N], uint32_t (&preds)[M]) { + Ldgsts_functor fct(dst, src); + ldgsts_(fct, preds); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// L D S +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void lds(uint16_t& dst, uint32_t ptr) { + asm volatile("ld.shared.b16 %0, [%1];\n" : "=h"(dst) : "r"(ptr)); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void lds(uint32_t& dst, uint32_t ptr) { + asm volatile("ld.shared.b32 %0, [%1];\n" : "=r"(dst) : "r"(ptr)); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void lds(uint2& dst, uint32_t ptr) { + asm volatile("ld.shared.v2.b32 {%0, %1}, [%2];\n" : "=r"(dst.x), "=r"(dst.y) : "r"(ptr)); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void lds(uint4& dst, uint32_t ptr) { + asm volatile("ld.shared.v4.b32 {%0, %1, %2, %3}, [%4];\n" + : "=r"(dst.x), "=r"(dst.y), "=r"(dst.z), "=r"(dst.w) + : "r"(ptr)); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// L D S M +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void ldsm(uint32_t& dst, uint32_t ptr) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 730 + asm volatile("ldmatrix.sync.aligned.m8n8.x1.shared.b16 {%0}, [%1];\n" : "=r"(dst) : "r"(ptr)); +#endif +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void ldsmt(uint32_t& dst, uint32_t ptr) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 730 + asm volatile("ldmatrix.sync.aligned.m8n8.x1.trans.shared.b16 {%0}, [%1];\n" + : "=r"(dst) + : "r"(ptr)); +#endif +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void ldsm(uint2& dst, uint32_t ptr) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 730 + asm volatile("ldmatrix.sync.aligned.m8n8.x2.shared.b16 {%0, %1}, [%2];\n" + : "=r"(dst.x), "=r"(dst.y) + : "r"(ptr)); +#endif +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void ldsmt(uint2& dst, uint32_t ptr) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 730 + asm volatile("ldmatrix.sync.aligned.m8n8.x2.trans.shared.b16 {%0, %1}, [%2];\n" + : "=r"(dst.x), "=r"(dst.y) + : "r"(ptr)); +#endif +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void ldsm(uint4& dst, uint32_t ptr) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 730 + asm volatile("ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%0, %1, %2, %3}, [%4];\n" + : "=r"(dst.x), "=r"(dst.y), "=r"(dst.z), "=r"(dst.w) + : "r"(ptr)); +#endif +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void ldsmt(uint4& dst, uint32_t ptr) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 730 + asm 
volatile("ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%0, %1, %2, %3}, [%4];\n" + : "=r"(dst.x), "=r"(dst.y), "=r"(dst.z), "=r"(dst.w) + : "r"(ptr)); +#endif +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// S T S M +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void stsm(uint32_t ptr, uint32_t const& src) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 + asm volatile("stmatrix.sync.aligned.m8n8.x1.shared.b16 [%0], {%1};\n" ::"r"(ptr), "r"(src)); +#endif +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void stsmt(uint32_t ptr, uint32_t const& src) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 + asm volatile("stmatrix.sync.aligned.m8n8.x1.trans.shared.b16 [%0], {%1};\n" ::"r"(ptr), "r"(src)); +#endif +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void stsm(uint32_t ptr, uint2 const& src) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 + asm volatile("stmatrix.sync.aligned.m8n8.x2.shared.b16 [%0], {%1, %2};\n" ::"r"(ptr), "r"(src.x), + "r"(src.y)); +#endif +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void stsmt(uint32_t ptr, uint2 const& src) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 + asm volatile("stmatrix.sync.aligned.m8n8.x2.trans.shared.b16 [%0], {%1, %2};\n" ::"r"(ptr), + "r"(src.x), "r"(src.y)); +#endif +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void stsm(uint32_t ptr, uint4 const& src) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 + asm volatile("stmatrix.sync.aligned.m8n8.x4.shared.b16 [%0], {%1, %2, %3, %4};\n" ::"r"(ptr), + "r"(src.x), "r"(src.y), "r"(src.z), "r"(src.w)); +#endif +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void stsmt(uint32_t ptr, uint4 const& src) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 + asm volatile( + "stmatrix.sync.aligned.m8n8.x4.trans.shared.b16 [%0], {%1, %2, %3, %4};\n" ::"r"(ptr), + "r"(src.x), "r"(src.y), "r"(src.z), "r"(src.w)); +#endif +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// S T G +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void stg(void* ptr, float val) { *reinterpret_cast(ptr) = val; } + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void stg(void* ptr, uint8_t val) { *reinterpret_cast(ptr) = val; } + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void stg(void* ptr, uint16_t val) { *reinterpret_cast(ptr) = val; } + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void stg(void* ptr, uint32_t val) { *reinterpret_cast(ptr) = val; } + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void stg(void* ptr, uint2 val) { *reinterpret_cast(ptr) = val; } + 
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void stg(void* ptr, uint4 val) { *reinterpret_cast(ptr) = val; } + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// S T S +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void sts(uint32_t ptr, uint16_t val) { + asm volatile("st.shared.b16 [%0], %1;\n" : : "r"(ptr), "h"(val)); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void sts(uint32_t ptr, uint32_t val) { + asm volatile("st.shared.b32 [%0], %1;\n" : : "r"(ptr), "r"(val)); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void sts(uint32_t ptr, uint2 val) { + asm volatile("st.shared.v2.b32 [%0], {%1, %2};\n" : : "r"(ptr), "r"(val.x), "r"(val.y)); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void sts(uint32_t ptr, uint4 val) { + asm volatile("st.shared.v4.b32 [%0], {%1, %2, %3, %4};\n" + : + : "r"(ptr), "r"(val.x), "r"(val.y), "r"(val.z), "r"(val.w)); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ void sts_(uint32_t (&ptrs)[N], Data_type const (&data)[N]) { +#pragma unroll + for (int ii = 0; ii < N; ++ii) { + sts(ptrs[ii], data[ii]); + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ void sts(uint32_t (&ptrs)[N], uint16_t const (&data)[N]) { + sts_(ptrs, data); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ void sts(uint32_t (&ptrs)[N], uint32_t const (&data)[N]) { + sts_(ptrs, data); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ void sts(uint32_t (&ptrs)[N], uint2 const (&data)[N]) { + sts_(ptrs, data); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ void sts(uint32_t (&ptrs)[N], uint4 const (&data)[N]) { + sts_(ptrs, data); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// +#define __HALF2_TO_UI(var) *(reinterpret_cast(&(var))) +#define __HALF2_TO_CUI(var) *(reinterpret_cast(&(var))) + +static __device__ __inline__ void atomicAdd_half2(half2* const address, const half2 val) { + asm volatile("{ red.global.add.noftz.f16x2 [%0],%1; }\n" ::"l"(address), "r"(__HALF2_TO_CUI(val)) + : "memory"); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +static inline __device__ uint32_t float4_to_char4(float x, float y, float z, float w) { +#if defined(USE_F2I_EMULATION_TRICK) + // Make sure the float is in the proper range. + float cx, cy, cz, cw; + if (CAN_BE_NEGATIVE) { + cx = fmha::clamp(x, -128.f, 127.f); + cy = fmha::clamp(y, -128.f, 127.f); + cz = fmha::clamp(z, -128.f, 127.f); + cw = fmha::clamp(w, -128.f, 127.f); + } else { + cx = fminf(x, 127.f); + cy = fminf(y, 127.f); + cz = fminf(z, 127.f); + cw = fminf(w, 127.f); + } + + // Re-add the magic number. 
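+  // (Classic float-to-int trick: assuming FP32_I2F_MAGIC_NUMBER is a constant of the form
+  // 2^23 + 2^22, adding it to a value in the clamped range forces the integer part into the
+  // low mantissa bits. For example, 5.0f + 12582912.0f = 12582917.0f, whose bit pattern is
+  // 0x4B400005 -- the low byte is the desired int8 value 0x05 -- so the prmt instructions
+  // below can pack the low byte of each of the four results without an explicit cvt.)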
+ cx += FP32_I2F_MAGIC_NUMBER; + cy += FP32_I2F_MAGIC_NUMBER; + cz += FP32_I2F_MAGIC_NUMBER; + cw += FP32_I2F_MAGIC_NUMBER; + + // We need unsigned ints... + uint32_t a = reinterpret_cast(cx); + uint32_t b = reinterpret_cast(cy); + uint32_t c = reinterpret_cast(cz); + uint32_t d = reinterpret_cast(cw); + + // Pack the numbers. + uint32_t dst; + asm volatile("prmt.b32 %0, %1, %2, 0x0040;\n" : "=r"(dst) : "r"(a), "r"(b)); + asm volatile("prmt.b32 %0, %0, %1, 0x0410;\n" : "+r"(dst) : "r"(c)); + asm volatile("prmt.b32 %0, %0, %1, 0x4210;\n" : "+r"(dst) : "r"(d)); + return dst; +#else + uint32_t a; + asm volatile("cvt.rni.sat.s32.f32 %0, %1;\n" : "=r"(a) : "f"(x)); + uint32_t b; + asm volatile("cvt.rni.sat.s32.f32 %0, %1;\n" : "=r"(b) : "f"(y)); + uint32_t c; + asm volatile("cvt.rni.sat.s32.f32 %0, %1;\n" : "=r"(c) : "f"(z)); + uint32_t d; + asm volatile("cvt.rni.sat.s32.f32 %0, %1;\n" : "=r"(d) : "f"(w)); + + uint32_t dst; + asm volatile("cvt.pack.sat.s8.s32.b32 %0, %1, %2, 0;\n" : "=r"(dst) : "r"(d), "r"(c)); + asm volatile("cvt.pack.sat.s8.s32.b32 %0, %1, %2, %0;\n" : "+r"(dst) : "r"(b), "r"(a)); + return dst; +#endif // defined(USE_F2I_EMULATION_TRICK) +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ void swizzle_rows(uint32_t& a, uint32_t& b, uint32_t c, uint32_t d) { + asm volatile("prmt.b32 %0, %1, %2, 0x6420;\n" : "=r"(a) : "r"(c), "r"(d)); + asm volatile("prmt.b32 %0, %1, %2, 0x7531;\n" : "=r"(b) : "r"(c), "r"(d)); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void ldsm_with_lds(uint2& data, uint32_t smem) { + int lane = threadIdx.x % 32; + data = {0, 0}; + uint4 v = {0, 0, 0, 0}; + uint32_t* a = reinterpret_cast(&v); + if (lane < 16) { + fmha::lds(v, smem); + } + int src_row = lane / 4; + int src_col = lane % 4; + for (int it = 0; it < 4; it++) { + uint32_t val = a[it]; + uint32_t x = __shfl_sync(uint32_t(-1), val, src_row); + __syncwarp(); + uint32_t y = __shfl_sync(uint32_t(-1), val, src_row + 8); + __syncwarp(); + if (it == src_col) { + data.x = x; + data.y = y; + } + } +} + +inline __device__ void ldsmt_with_lds(uint2& data, uint32_t smem) { + int lane = threadIdx.x % 32; + + uint4 tmp16{0, 0, 0, 0}; // 16B + + if (lane < 16) { + fmha::lds(tmp16, smem); + } + + uint16_t* tmp16c = reinterpret_cast(&tmp16); // 8x2B: we move pairs + + uint16_t* t = reinterpret_cast(&data); // 4x2B + + int const src_col = lane / 4; // 0 - 7 + int const src_row = (lane % 4) * 2; + +// we have to shuffle the values to distribute them in the warp +#pragma unroll + for (int it = 0; it < 8; it++) { + uint16_t val, x, y; + val = tmp16c[it]; + x = __shfl_sync(uint32_t(-1), val, src_row + 0); + __syncwarp(); + y = __shfl_sync(uint32_t(-1), val, src_row + 1); + __syncwarp(); + + if (src_col == it) { + t[0] = x; + t[1] = y; + } + val = tmp16c[it]; + x = __shfl_sync(uint32_t(-1), val, src_row + 8); + __syncwarp(); + y = __shfl_sync(uint32_t(-1), val, src_row + 9); + __syncwarp(); + + if (src_col == it) { + t[2] = x; + t[3] = y; + } + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MaxOp { + __device__ inline T operator()(T const& x, T const& y) { return x > y ? 
x : y; } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct SumOp { + __device__ inline T operator()(T const& x, T const& y) { return x + y; } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Allreduce { + static_assert(THREADS == 32 || THREADS == 16 || THREADS == 8 || THREADS == 4); + + template + static __device__ inline T run(T x, Operator& op) { + constexpr int OFFSET = THREADS / 2; + x = op(x, __shfl_xor_sync(uint32_t(-1), x, OFFSET)); + return Allreduce::run(x, op); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Allreduce<2> { + template + static __device__ inline T run(T x, Operator& op) { + x = op(x, __shfl_xor_sync(uint32_t(-1), x, 1)); + return x; + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +__device__ inline void quad_reduce(float (&dst)[M], float (&src)[M], Operator& op) { +#pragma unroll + for (int mi = 0; mi < M; mi++) { + dst[mi] = src[mi]; + dst[mi] = op(dst[mi], __shfl_down_sync(uint32_t(-1), dst[mi], 2)); + dst[mi] = op(dst[mi], __shfl_down_sync(uint32_t(-1), dst[mi], 1)); + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +__device__ inline void quad_reduce(float (&dst)[M], float2 (&src)[M], Operator& op) { + float tmp[M]; +#pragma unroll + for (int mi = 0; mi < M; mi++) { + tmp[mi] = op(src[mi].x, src[mi].y); + } + quad_reduce(dst, tmp, op); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +__device__ inline void quad_allreduce(float (&dst)[M], float (&src)[M], Operator& op) { +#pragma unroll + for (int mi = 0; mi < M; mi++) { + dst[mi] = src[mi]; + dst[mi] = Allreduce<4>::run(dst[mi], op); + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +__device__ inline void quad_allreduce(float (&dst)[M], float2 (&src)[M], Operator& op) { + float tmp[M]; +#pragma unroll + for (int mi = 0; mi < M; mi++) { + tmp[mi] = op(src[mi].x, src[mi].y); + } + quad_allreduce(dst, tmp, op); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ uint32_t elect_one_sync() { + uint32_t pred = 0; +#if __CUDA_ARCH__ >= 900 +#if !defined(__CUDACC_RTC__) + uint32_t laneid = 0; + asm volatile( + "\n\ + {\n\ + .reg .b32 %rx;\n\ + .reg .pred %px;\n\ + elect.one.sync %rx|%px, %2;\n\ + @%px mov.s32 %1, 1;\n\ + mov.s32 %0, %rx;\n\ + }\n" + : "+r"(laneid), "+r"(pred) + : "r"(0xFFFFFFFF)); +#else + pred = threadIdx.x == 0; +#endif +#endif + return pred; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ uint16_t float2_to_e4m3x2(float x, float y) { +#if defined(__CUDA_ARCH__) && \ + ((__CUDA_ARCH__ == 890 && defined(FMHA_ENABLE_SM89_QMMA)) || (__CUDA_ARCH__ >= 900)) + uint16_t res; + asm volatile("cvt.rn.e4m3x2.f32.satfinite %0, %2, %1;" : "=h"(res) : "f"(x), "f"(y)); + return res; +#else + assert(false); + return 0; +#endif +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ uint32_t float4_to_e4m3x4(float x, float y, float z, float w) { +#if 
defined(__CUDA_ARCH__) && \ + ((__CUDA_ARCH__ == 890 && defined(FMHA_ENABLE_SM89_QMMA)) || (__CUDA_ARCH__ >= 900)) + uint32_t res; + asm volatile( + "{\n" + ".reg .b16 lo;\n" + ".reg .b16 hi;\n" + "cvt.rn.e4m3x2.f32.satfinite lo, %2, %1;\n" + "cvt.rn.e4m3x2.f32.satfinite hi, %4, %3;\n" + "mov.b32 %0, {lo, hi};\n" + "}" + : "=r"(res) + : "f"(x), "f"(y), "f"(z), "f"(w)); + return res; +#else + assert(false); + return 0; +#endif +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ uint32_t float4_to_e5m2x4(float x, float y, float z, float w) { +#if defined(__CUDA_ARCH__) && \ + ((__CUDA_ARCH__ == 890 && defined(FMHA_ENABLE_SM89_QMMA)) || (__CUDA_ARCH__ >= 900)) + uint32_t res; + asm volatile( + "{\n" + ".reg .b16 lo;\n" + ".reg .b16 hi;\n" + "cvt.rn.e5m2x2.f32.satfinite lo, %2, %1;\n" + "cvt.rn.e5m2x2.f32.satfinite hi, %4, %3;\n" + "mov.b32 %0, {lo, hi};\n" + "}" + : "=r"(res) + : "f"(x), "f"(y), "f"(z), "f"(w)); + return res; +#else + assert(false); + return 0; +#endif +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ uint32_t half4_to_e4m3x4(uint32_t const h2_0, uint32_t const h2_1) { +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 890)) + uint32_t res; + asm volatile( + "{\n" + ".reg .b16 lo, hi;\n" + "cvt.satfinite.rn.e4m3x2.f16x2 lo, %1;\n" + "cvt.satfinite.rn.e4m3x2.f16x2 hi, %2;\n" + "mov.b32 %0, {lo, hi};\n" + "}\n" + : "=r"(res) + : "r"(h2_0), "r"(h2_1)); + return res; +#else + assert(false); + return 0; +#endif +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ uint32_t half4_to_e5m2x4(uint32_t const h2_0, uint32_t const h2_1) { +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 890)) + uint32_t res; + asm volatile( + "{\n" + ".reg .b16 lo, hi;\n" + "cvt.satfinite.rn.e5m2x2.f16x2 lo, %1;\n" + "cvt.satfinite.rn.e5m2x2.f16x2 hi, %2;\n" + "mov.b32 %0, {lo, hi};\n" + "}\n" + : "=r"(res) + : "r"(h2_0), "r"(h2_1)); + return res; +#else + assert(false); + return 0; +#endif +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// Helpers to pack float4 into a destination register with 4 8bit values +template +inline __device__ uint32_t float4_to_8bitx4(float const x, float const y, float const z, + float const w) { + return float4_to_char4(x, y, z, w); +}; + +template <> +inline __device__ uint32_t float4_to_8bitx4(float const x, float const y, float const z, + float const w) { + return float4_to_e4m3x4(x, y, z, w); +}; + +template <> +inline __device__ uint32_t float4_to_8bitx4(float const x, float const y, float const z, + float const w) { + return float4_to_e5m2x4(x, y, z, w); +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ uint32_t half4_to_fp8x4(uint32_t const h2_0, uint32_t const h2_1); + +template <> +inline __device__ uint32_t half4_to_fp8x4(uint32_t const h2_0, uint32_t const h2_1) { + return half4_to_e4m3x4(h2_0, h2_1); +} + +template <> +inline __device__ uint32_t half4_to_fp8x4(uint32_t const h2_0, uint32_t const h2_1) { + return half4_to_e5m2x4(h2_0, h2_1); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ uint32_t float4_to_fp8x4(float const, float const, float const, float const); + +template <> +inline __device__ 
uint32_t float4_to_fp8x4(float const x, float const y, + float const z, float const w) { + return float4_to_e4m3x4(x, y, z, w); +} + +template <> +inline __device__ uint32_t float4_to_fp8x4(float const x, float const y, + float const z, float const w) { + return float4_to_e5m2x4(x, y, z, w); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void fence_view_async_shared() { + // Issue a shared memory fence for async operations (FENCE.VIEW.ASYNC.S) + // only compiles on sm90+ + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 + asm volatile("fence.proxy.async.shared::cta;\n"); +#else + assert(false); +#endif +} + +inline __device__ void fence_view_async_global() { + // Issue a global memory fence for async operations (FENCE.VIEW.ASYNC.G) + // only compiles on sm90+ + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 + asm volatile("fence.proxy.async.global::cta;\n"); +#else + assert(false); +#endif +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ char* align_1024(char* ptr) { + uint64_t address_bit = reinterpret_cast(ptr); + uint64_t offset = address_bit % 1024; + if (offset == 0) { + return ptr; + } else { + return ptr + (1024 - offset); + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ float atomicMaxFloat(float* addr, float value) { + float old; + old = (value >= 0) ? __int_as_float(atomicMax((int*)addr, __float_as_int(value))) + : __uint_as_float(atomicMin((unsigned int*)addr, __float_as_uint(value))); + return old; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ float atomicMaxFloatPos_(float* addr, float value) { + // VALUE MUST BE POSITIVE! USED ONLY FOR INTERNAL AMAX REDUCTION. + float old = __int_as_float(atomicMax((int*)addr, __float_as_int(value))); + return old; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ float max3Pos_(float const a, float const b, float const c) { + // VALUE MUST BE POSITIVE! USED ONLY FOR INTERNAL AMAX REDUCTION. + float res; +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 + int32_t a_ = reinterpret_cast(a); + int32_t b_ = reinterpret_cast(b); + int32_t c_ = reinterpret_cast(c); + int32_t tmp; + asm volatile("max.s16x2 %0, %1, %2;\n" : "=r"(tmp) : "r"(a_), "r"(b_)); + asm volatile("max.s16x2 %0, %0, %1;\n" : "+r"(tmp) : "r"(tmp), "r"(c_)); + res = reinterpret_cast(tmp); +#else + res = fmaxf(a, fmaxf(b, c)); +#endif + return res; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// Fast approximate tanh. +static inline __device__ float __tanhf(float x) { +#if (__CUDA_ARCH__ >= 750) + float r = x; + asm("tanh.approx.f32 %0, %0;" : "+f"(r)); + return r; +#else + return tanhf(x); +#endif +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace fmha diff --git a/csrc/fmha_v2/fmha/warpspec/circular_buffer.h b/csrc/fmha_v2/fmha/warpspec/circular_buffer.h new file mode 100644 index 0000000000..903319490a --- /dev/null +++ b/csrc/fmha_v2/fmha/warpspec/circular_buffer.h @@ -0,0 +1,399 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2011-2024 NVIDIA CORPORATION & AFFILIATES. All rights + * reserved. 
SPDX-License-Identifier: NVIDIA TensorRT Source Code License Agreement + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. + */ + +#include +#include +#include + +#pragma once + +namespace fmha { +namespace ws { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/* Shared storage for barriers needed by both producer and consumer */ +template +struct CircularBufferBarriers { + __align__(8) uint64_t entryProducedBarriers[DEPTH]; + __align__(8) uint64_t entryConsumedBarriers[DEPTH]; + + CircularBufferBarriers() = default; + // CircularBufferBarriers must live in __shared__ -- cannot copy + CircularBufferBarriers(CircularBufferBarriers const& other) = delete; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/* Producer class */ +template +class CircularBufferWriter { + protected: + uint32_t _wptr; + uint32_t _phase; + fmha::Arrive_wait _entryConsumedBarriers; + fmha::Arrive_wait _entryProducedBarriers; + + public: + inline __device__ CircularBufferWriter(CircularBufferBarriers* barriers) + : _entryProducedBarriers(barriers->entryProducedBarriers), + _entryConsumedBarriers(barriers->entryConsumedBarriers), + _wptr(0), + _phase(0xffffffff) {} + + inline __device__ int ptr() { return _wptr; } + + // Return the equivalent read phase. + inline __device__ int phase() { return _phase ^ 0xffffffff; } + + /* Reserve space in the buffer for TMA */ + inline __device__ int tmaReserve(int tid0, int transactioncnt) { + int ptr = threadReserve(); + _entryProducedBarriers.bar_arrive_set_transactioncnt(ptr, transactioncnt, tid0); + return ptr; + } + + /* Reserve space in the buffer for producer threads */ + inline __device__ int threadReserve() { + wait(); + return advance(); + } + + inline __device__ int advance() { + int rval = _wptr; + _phase ^= (1 << _wptr); + _wptr += 1; + if (_wptr >= DEPTH) { + _wptr = 0; + } + return rval; + } + + /* Wait for space to become available in the buffer */ + inline __device__ int wait() { + int ready = _entryConsumedBarriers.bar_peek(_wptr, (_phase >> _wptr) & 1); + if (!ready) _entryConsumedBarriers.bar_wait(_wptr, (_phase >> _wptr) & 1); + return _wptr; + } + + /* Signal that data is ready */ + inline __device__ void threadCommit(int tid0, int id) { + if (tid0) { + _entryProducedBarriers.bar_arrive_normal(id); + } + } + + /* Get the barrier address, needed by TMA */ + inline __device__ uint64_t* barrier_ptr(int id) { + return _entryProducedBarriers.get_bar_addr(id); + } + + inline __device__ void setPtr(int ptr) { _wptr = ptr; } + + inline __device__ void setPhase(int phase) { _phase = phase; } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/* Consumer class */ +template +class CircularBufferReader { + private: + uint32_t _rptr; + uint32_t _phase; + + public: + fmha::Arrive_wait _entryProducedBarriers; + fmha::Arrive_wait _entryConsumedBarriers; + + inline __device__ CircularBufferReader(CircularBufferBarriers* barriers) + : _entryProducedBarriers(barriers->entryProducedBarriers), + _entryConsumedBarriers(barriers->entryConsumedBarriers), + 
_rptr(0), + _phase(0) {} + + inline __device__ void setProducerCta(int cta_id) { + _entryConsumedBarriers.set_bar_base_dsmem(cta_id); + } + + /* Peek at the head */ + inline __device__ int peek() { + return _entryProducedBarriers.bar_peek(_rptr, (_phase >> _rptr) & 1); + } + + /* Wait for the head to be ready */ + inline __device__ int wait() { + _entryProducedBarriers.bar_wait(_rptr, (_phase >> _rptr) & 1); + return _rptr; + } + + /* Advance the head pointer */ + inline __device__ void advance() { + _phase ^= (1 << _rptr); + _rptr += 1; + if (_rptr >= DEPTH) { + _rptr = 0; + } + } + + inline __device__ int ptr() { return _rptr; } + + inline __device__ uint32_t phase() { return _phase; } + + /* Indicate consumption of data at specified pointer. + The producer is now free to overwrite it + */ + inline __device__ void complete(int tid0, int ptr) { + if (tid0) { + if (CGA_SIZE > 1) { + _entryConsumedBarriers.bar_arrive_dsmem(ptr); + } else { + _entryConsumedBarriers.bar_arrive_normal(ptr); + } + } + } + + /* Simplification of complete and advance for cases + where they don't need to be reordered/separated for performance + */ + inline __device__ void pop(int tid0) { + complete(tid0, _rptr); + advance(); + } + + /* Overrides for pointer and phase. Used for shared buffers */ + inline __device__ void setPtr(int ptr) { _rptr = ptr; } + + inline __device__ void setPhase(uint32_t phase) { _phase = phase; } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +class CircularBuffer { + protected: + CircularBufferBarriers _barriers; + + public: + inline __device__ void init(int tid0, int producer_thread_count, int consumer_thread_count) { + if (tid0) { + for (int i = 0; i < DEPTH; i++) { + fmha::bar_create(&_barriers.entryProducedBarriers[i], producer_thread_count); + fmha::bar_create(&_barriers.entryConsumedBarriers[i], consumer_thread_count); + } + } + } + + using Reader = CircularBufferReader; + using Writer = CircularBufferWriter; + + inline __device__ Reader createReader() { return Reader(&_barriers); } + + inline __device__ Writer createWriter() { return Writer(&_barriers); } + + inline __device__ int depth() { return DEPTH; } + + CircularBuffer() = default; + // CircularBuffer must live in __shared__ -- cannot copy + CircularBuffer(CircularBuffer const& other) = delete; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +class CircularBufferWithDataReader : public CircularBufferReader { + private: + T* _data; + + public: + inline __device__ CircularBufferWithDataReader(CircularBufferBarriers* barriers, T* data) + : CircularBufferReader(barriers), _data(data) {} + + inline __device__ T read() { return _data[this->ptr()]; } + + inline __device__ T pop(int tid0, bool read_data = true) { + T rval; + int ready = this->peek(); + if (!ready) this->wait(); + if (read_data) { + rval = read(); + fmha::fence_view_async_shared(); + } + this->complete(tid0, this->ptr()); + this->advance(); + return rval; + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +class CircularBufferWithDataWriter : public CircularBufferWriter { + private: + T* _data; + + public: + inline __device__ CircularBufferWithDataWriter(CircularBufferBarriers* barriers, T* data) + : CircularBufferWriter(barriers), _data(data) {} + + inline __device__ void write(int ptr, T const& wrdat) { _data[ptr] = wrdat; } + + inline __device__ int 
push(int tid0, T const& wrdat, bool writeData = true, + uint32_t transactioncnt = 0) { + int ptr = this->threadReserve(); + if (tid0 && writeData) { + write(ptr, wrdat); + __threadfence_block(); + } + if (transactioncnt == 0) + this->threadCommit(tid0, ptr); + else + this->_entryProducedBarriers.bar_arrive_set_transactioncnt(ptr, transactioncnt, tid0); + return ptr; + } + + template + inline __device__ int push_with_sync(int tid0, T const& wrdat, bool writeData = true, + uint32_t transactioncnt = 0) { + int ptr = this->threadReserve(); + named_barrier_wait(SYNC_BAR, SYNC_THREADS); + if (tid0 && writeData) { + write(ptr, wrdat); + __threadfence_block(); + } + if (transactioncnt == 0) + this->threadCommit(tid0, ptr); + else + this->_entryProducedBarriers.bar_arrive_set_transactioncnt(ptr, transactioncnt, tid0); + return ptr; + } + + inline __device__ void broadcast(T const& wrdat) { + int offset = this->threadReserve(); + for (int i = 0; i < CGA_SIZE; i++) { + push_to_cta(wrdat, i, offset); + } + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +class CircularBufferWithData : public CircularBuffer { + private: + T _data[DEPTH]; + + public: + inline __device__ T* data() { return _data; } + + using Reader = CircularBufferWithDataReader; + using Writer = CircularBufferWithDataWriter; + + inline __device__ Reader createReader() { return Reader(&this->_barriers, _data); } + + inline __device__ Writer createWriter() { return Writer(&this->_barriers, _data); } + + CircularBufferWithData() = default; + // Must live in __shared__ -- cannot copy + CircularBufferWithData(CircularBufferWithData const& other) = delete; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct OrderedMutex { + uint64_t barriers[2]; + + inline __device__ void init(int tid0, int threads0, int threads1) { + if (tid0) { + fmha::bar_create(&barriers[0], threads0); + fmha::bar_create(&barriers[1], threads1); + } + } +}; + +class OrderedMutexAccessor { + private: + int _phase; + int _id; + int _barrier_id; + + fmha::Arrive_wait _barriers; + + public: + inline __device__ OrderedMutexAccessor(OrderedMutex& m, int id, int barrier_id) + : _phase(0), _id(id), _barriers(m.barriers), _barrier_id(barrier_id) {} + + inline __device__ void arrive() { _barriers.bar_arrive_normal(_id); } + + inline __device__ void wait() { + int ready = _barriers.bar_peek(_id ^ 1, _phase); + if (!ready) { + _barriers.bar_wait(_id ^ 1, _phase); + } + _phase ^= 1; + } + + inline __device__ void named_bar_arrive() { + // ... + // Softmax ends + // Make sure barrier is not moving around + if (_id == 0) { + named_barrier_wait(_barrier_id, 256); + } + } + + inline __device__ void named_bar_wait() { + // Make sure barrier is not moving around + if (_id == 1) { + named_barrier_wait(_barrier_id, 256); + } + // Softmax starts + // ... 
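// ----------------------------------------------------------------------------------------------
// Illustrative sketch (not part of the header above): a host-side analogue of the OrderedMutex
// handshake, following the pairing used above, where group `id` arrives on slot `id` and waits on
// slot `id ^ 1`, and one group performs an initial arrive so the other takes the first turn. The
// real code uses hardware named barriers and phase bits; std::binary_semaphore (C++20) only
// mimics the ordering, and all names below are hypothetical.
// ----------------------------------------------------------------------------------------------
#include <cstdio>
#include <semaphore>
#include <thread>

static std::binary_semaphore g_slot0{0}, g_slot1{0};
static std::binary_semaphore* g_slots[2] = {&g_slot0, &g_slot1};

static void compute_group(int id, int steps) {
  for (int s = 0; s < steps; ++s) {
    g_slots[id ^ 1]->acquire();  // wait(): block until the other group has arrived
    std::printf("group %d owns the ordered section for step %d\n", id, s);
    g_slots[id]->release();      // arrive(): hand the section to the other group
  }
}

int main() {
  g_slot1.release();  // mirrors warpgroup 1's initial arrive(), giving group 0 the first turn
  std::thread g0(compute_group, 0, 4), g1(compute_group, 1, 4);
  g0.join();
  g1.join();
  return 0;
}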
+ } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct ComputeGroupBarrier { + uint64_t barrier; + + inline __device__ void init(int tid0, int threads) { + if (tid0) { + fmha::bar_create(&barrier, threads); + } + } +}; + +class ComputeGroupBarrierAccessor { + private: + int _phase; + fmha::Arrive_wait _barrier; + + public: + inline __device__ ComputeGroupBarrierAccessor(ComputeGroupBarrier& m) + : _phase(0), _barrier(&m.barrier) {} + + inline __device__ void arrive() { _barrier.bar_arrive_normal(0); } + + inline __device__ void wait() { + int ready = _barrier.bar_peek(0, _phase); + if (!ready) { + _barrier.bar_wait(0, _phase); + } + _phase ^= 1; + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace ws +} // namespace fmha diff --git a/csrc/fmha_v2/fmha/warpspec/compute.h b/csrc/fmha_v2/fmha/warpspec/compute.h new file mode 100644 index 0000000000..9aae70b2e7 --- /dev/null +++ b/csrc/fmha_v2/fmha/warpspec/compute.h @@ -0,0 +1,606 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2011-2024 NVIDIA CORPORATION & AFFILIATES. All rights + * reserved. SPDX-License-Identifier: NVIDIA TensorRT Source Code License Agreement + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. + */ + +#pragma once + +#include "fmha/alibi_params.h" +#include "fmha/hopper/fragment.h" +#include "fmha/hopper/utils_warpgroup.h" +#include "fmha/softmax.h" +#include "fmha/warpspec/circular_buffer.h" +#include "fmha/warpspec/dma.h" +#include "fmha/warpspec/epilogue.h" + +namespace fmha { +namespace ws { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + // Template instruction traits to specialize structs + template class Instruction_traits, + // Kernel Traits + typename Kernel_traits> +struct Compute { + // The shared struct. + using Shared = typename Kernel_traits::Shared; + + // The q, or kv tile reader. + using Circular_buffer_q_reader = typename Kernel_traits::Circular_buffer_q_reader; + using Circular_buffer_kv_reader = typename Kernel_traits::Circular_buffer_kv_reader; + + // The instruction traits for BMM1. + using Traits_p = typename Kernel_traits::Traits_p; + // The instruction traits for BMM2. + using Traits_o = typename Kernel_traits::Traits_o; + + // The CTA description for BMM1. + using Cta_tile_p = typename Kernel_traits::Cta_tile_p; + // The CTA description for BMM2. + using Cta_tile_o = typename Kernel_traits::Cta_tile_o; + + // The Q shared memory tile. + using Smem_tile_q = typename Kernel_traits::Smem_tile_q; + // The K shared memory tile. + using Smem_tile_k = typename Kernel_traits::Smem_tile_k; + // The V shared memory tile. + using Smem_tile_v = typename Kernel_traits::Smem_tile_v; + + // The GMMA compute tile for BMM1. + using Compute_tile_p = typename Kernel_traits::Compute_tile_p; + // The GMMA compute tile for BMM2. + using Compute_tile_o = typename Kernel_traits::Compute_tile_o; + + // The MMA tile for the BMM1. + using Mma_tile_p = typename Kernel_traits::Mma_tile_p; + // The MMA tile for the BMM2. 
+ using Mma_tile_o = typename Kernel_traits::Mma_tile_o; + + // The fragment of BMM1 output. + using Fragment_p = typename Compute_tile_o::Fragment; + + // The global memory tile for storing BMM2 output. + using Gmem_tile_o = typename Kernel_traits::Gmem_tile_o; + + // Softmax + using Softmax = Softmax; + + // BMM2 epilogue + using Tile_o_epilogue = Tile_o_epilogue; + + // The step size of Q loop. + enum { STEP_Q = Kernel_traits::STEP_Q }; + + // The step size of KV loop. + enum { STEP_KV = Kernel_traits::STEP_KV }; + + // The number of compute groups (currently fixed at 2). + enum { NUM_COMPUTE_GROUPS = Kernel_traits::NUM_COMPUTE_GROUPS }; + + // Whether we skip those masked tiles when causal mask is enabled ? + enum { SKIP_CAUSAL_MASK_TILES = Kernel_traits::CAUSAL_MASK && !Kernel_traits::USE_CUSTOM_MASK }; + + // Whether we attend to the specific sliding window or chunk ? + enum { SLIDING_OR_CHUNKED_ATTENTION = Kernel_traits::SLIDING_OR_CHUNKED_ATTENTION }; + + // Are we applying alibi bias (drop FMA optimizations for accuracy reasons). + enum { APPLY_ALIBI = Kernel_traits::APPLY_ALIBI }; + + // Do we use custom mask input ? + enum { USE_CUSTOM_MASK = Kernel_traits::USE_CUSTOM_MASK }; + + // Do we always need to apply the mask ? + enum { ALWAYS_APPLY_MASK = APPLY_ALIBI || USE_CUSTOM_MASK }; + + // Enable mutex for overlapping mma and softmax instructions. + enum { ENABLE_MUTEX = Kernel_traits::ENABLE_MUTEX }; + + // The head_dimension groups. + enum { D_GROUPS = Kernel_traits::D_GROUPS }; + + // The MMA_K groups (corresponding to head_dimension groups). + enum { BMM1_MMAS_K_GROUPS = Kernel_traits::D_GROUPS }; + + // The number of MMAS_K for each head_dimension group. + enum { BMM1_MMAS_K_PER_GROUP = Mma_tile_p::MMAS_K / BMM1_MMAS_K_GROUPS }; + + // The MMA_K groups (corresponding to kv_step groups). + enum { BMM2_MMAS_K_GROUPS = Kernel_traits::BMM2_K_GROUPS }; + + // The number of MMAS_K for each head_dimension group. + enum { BMM2_MMAS_K_PER_GROUP = Mma_tile_o::MMAS_K / BMM2_MMAS_K_GROUPS }; + + // The tile size of V after head_dimension split. + enum { TILE_SIZE_V_PER_D_GROUP = STEP_KV * Kernel_traits::D_PER_GROUP }; + + enum { TILE_SIZE_V = STEP_KV * Kernel_traits::DV }; + + enum { TILE_BYTES_V_PER_D_GROUP = STEP_KV * Kernel_traits::D_BYTES_PER_GROUP }; + + enum { TILE_BYTES_V_PER_K_GROUP = BMM2_MMAS_K_PER_GROUP * Kernel_traits::D_BYTES_PER_GROUP }; + + // Named barrier for inter-warpgroup sync + enum { SYNC_BARRIER = Kernel_traits::MMA_SYNC_BARRIER_ID }; + + // Whether Q and KV is in separate buffer, which means we need to consider different Q and KV + // lengths. + enum { SEPARATE_Q_KV_BUFFER = Kernel_traits::SEPARATE_Q_KV_BUFFER }; + + enum { SAGE_BLOCK_SIZE_Q = Kernel_traits::SAGE_BLOCK_SIZE_Q }; + + // sanitize 0 to -1, avoid DIV BY ZERO below + enum { + SAGE_BLOCK_SIZE_K = Kernel_traits::SAGE_BLOCK_SIZE_K > 0 ? Kernel_traits::SAGE_BLOCK_SIZE_K : -1 + }; + + enum { + SAGE_BLOCK_SIZE_V = Kernel_traits::SAGE_BLOCK_SIZE_V > 0 ? 
Kernel_traits::SAGE_BLOCK_SIZE_V : -1 + }; + + // BLOCK_SIZE_Q should be multiply of STEP_Q (usually 64) so that q scale can be fused into + // scale_bmm1 + static_assert(SAGE_BLOCK_SIZE_Q < 0 || SAGE_BLOCK_SIZE_Q % STEP_Q == 0); + static_assert(SAGE_BLOCK_SIZE_K < 0 || SAGE_BLOCK_SIZE_K % 8 == 0); // 8 = columns of a gmma CORE + static_assert(SAGE_BLOCK_SIZE_V < 0 || + SAGE_BLOCK_SIZE_V % 32 == 0); // 32 = K dimension of a qgmma + + // SAGE_BLOCKS_PER_STEP_X is used to declare scale buffer like `float + // scales_k[SAGE_BLOCKS_PER_STEP_K];` if SAGE_BLOCKS_PER_STEP_X == 0, you will get `zero-sized + // variable is not allowed in device code` error from nvcc, so the minimal value have to be 1. But + // don't worry, unused local variables will be optimized out by compiler. + enum { SAGE_BLOCKS_PER_STEP_K = std::max(STEP_KV / SAGE_BLOCK_SIZE_K, 1) }; + + enum { SAGE_BLOCKS_PER_STEP_V = std::max(STEP_KV / SAGE_BLOCK_SIZE_V, 1) }; + +#define K_TILE_WAIT() \ + int ready_k = cbr_k.peek(); \ + if (!ready_k) { \ + cbr_k.wait(); \ + } + +#define KV_TILE_COMPLETE() \ + cbr_k.complete(tidx == 0, cbr_k.ptr()); \ + cbr_v.complete(tidx == 0, cbr_v.ptr()); \ + cbr_k.advance(); \ + cbr_v.advance(); + +#define COMPUTE_SINGLE_TILE(IS_FIRST_COL, APPLY_MASK) \ + compute_single_tile( \ + params, ctile_p, softmax, ctile_o, p_max, p_sum, tidx, actual_kv_seqlen, alibi_head_scale, \ + USE_CUSTOM_MASK ? (head_info.mask_sum_s + q_step_idx * STEP_Q + local_q_tile_offset) \ + : (q_step_idx * STEP_Q + head_info.q_tile_offset), \ + kv_step_idx * STEP_KV, sage_scale_row, cbr, cbr_v, mutex_accessor, \ + kv_step_idx == kv_idx_end - 1); + + //////////////////////////////////////////////////////////////////////////////////////////////// + + inline __device__ int div_up(int a, int b) { return (a + b - 1) / b; } + + //////////////////////////////////////////////////////////////////////////////////////////////// + + // Compute the kv_left_mask_end and kv_right_mask_start, where mask is applied when kv_idx < + // kv_left_mask_end or kv_idx >= kv_right_mask_start. + template + inline __device__ std::pair compute_kv_mask_start_end(Params const& params, + int const tile_offset_start, + int const tile_offset_end, + int const kv_idx_end) { + // The kv_left_mask_end is 0 by default. + int kv_left_mask_end = 0; + // The kv_right_mask_start is kv_idx_end - 1 by default, which means only the last kv tile is + // masked. + int kv_right_mask_start = kv_idx_end - 1; + + // Always apply mask is specified. + if constexpr (ALWAYS_APPLY_MASK) { + return std::make_pair(0, 0); + } + + // Is the chunked_attention used ? + bool is_chunked_attention = params.log2_chunked_attention_size > 0; + + // The left mask is needed when we attend to a specific sliding window or chunk. + if constexpr (SLIDING_OR_CHUNKED_ATTENTION) { + // The kv_left_mask_end is the start of the chunk. + kv_left_mask_end = + div_up(is_chunked_attention ? ((tile_offset_end >> params.log2_chunked_attention_size) + << params.log2_chunked_attention_size) + : (tile_offset_end + 1 - params.sliding_window_size), + STEP_KV); + } + + // The right mask is needed when causal mask (including sliding_window_attention or chunked + // attention) is used. + if constexpr (SKIP_CAUSAL_MASK_TILES) { + kv_right_mask_start = tile_offset_start / STEP_KV; + } + + // Return the kv_left_mask_end and kv_right_mask_start. 
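// ----------------------------------------------------------------------------------------------
// Illustrative sketch (not part of the file above): the same masking-range computation for the
// plain causal plus sliding-window case, with Kernel_traits replaced by ordinary ints. Tiles with
// kv_idx < left_end need the sliding-window (left) mask, tiles with kv_idx >= right_start cross
// the causal diagonal, and everything in between can take the unmasked fast path. Constant and
// function names are hypothetical.
// ----------------------------------------------------------------------------------------------
#include <algorithm>
#include <utility>

constexpr int kStepKV = 256;

inline int div_up_host(int a, int b) { return (a + b - 1) / b; }

std::pair<int, int> mask_range(int tile_offset_start, int tile_offset_end,
                               int sliding_window_size) {
  // Left boundary: derived from the last Q row of the tile pair, like tile_offset_end above.
  int left_end = std::max(0, div_up_host(tile_offset_end + 1 - sliding_window_size, kStepKV));
  // Right boundary: derived from the first Q row, like tile_offset_start above.
  int right_start = tile_offset_start / kStepKV;
  return {left_end, right_start};
}

// Example: Q rows [1024, 1279], window 512 -> {3, 4}. Any visited KV tile with index < 3 takes
// the left-masked path, tiles 4 and beyond take the causal-masked path, tile 3 runs unmasked.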
+ return std::make_pair(kv_left_mask_end, kv_right_mask_start); + } + + //////////////////////////////////////////////////////////////////////////////////////////////// + + template + inline __device__ void run(int warpgroup_id, int tidx, Shared* shared, Params const& params) { + auto head_tracker = shared->head_info_tracker[warpgroup_id].createReader(); + auto cbr = shared->tma_q_tracker[warpgroup_id].createReader(); + + auto cbr_k = shared->tma_k_tracker.createReader(); + auto cbr_v = shared->tma_v_tracker.createReader(); + + // Ctile_p initialize (relies on q_stage, kv_stage). + char* smem_q = reinterpret_cast(&shared->smem_q[warpgroup_id][0]); + char* smem_k = reinterpret_cast(&shared->smem_k[0]); + Compute_tile_p ctile_p(smem_q, smem_k); + + // Softmax + Softmax softmax(params, tidx); + + // Ctile_o initialize (relies on kv_stage). + uint32_t smem_v = __cvta_generic_to_shared(&shared->smem_v[0]); + Compute_tile_o ctile_o(0, smem_v); + + // Mutex between two compute groups. + OrderedMutexAccessor mutex_accessor(shared->compute_mutex, warpgroup_id, SYNC_BARRIER); + // Notify warpgroup 0 to execute HGMMA first (overlap HGMMA and Softmax Math Instructions). + if (ENABLE_MUTEX && warpgroup_id == 1 && Kernel_traits::ELEMENT_BYTES == 2) { + mutex_accessor.arrive(); + } + + // While loop for different heads. + while (true) { + typename Shared::Head_info head_info = head_tracker.pop(true); + + if (head_info.kv_steps == -1) { + break; + } + + int const kv_steps = head_info.kv_steps; + int const q_steps = head_info.q_steps; + int const local_q_tile_offset = head_info.local_q_tile_offset; + // The global q tile offset (based on past kv cache). + // Not used by custom mask input. + int const q_tile_offset = + SEPARATE_Q_KV_BUFFER ? head_info.q_tile_offset : head_info.local_q_tile_offset; + int const actual_q_seqlen = head_info.actual_seqlen; + // Contiguous QKV FMHA assumes q, and kv have the same sequence length. + int const actual_kv_seqlen = + SEPARATE_Q_KV_BUFFER ? head_info.actual_kv_seqlen : actual_q_seqlen; + + // Calculate the alibi head_scaling_factor. + float alibi_head_scale = APPLY_ALIBI ? get_alibi_head_scaling_factor( + head_info.bidh, params.alibi_params) + : 0.f; + // pre-compute the row of the scale for reuse + int sage_scale_row; + if constexpr (Kernel_traits::SAGE_ATTENTION) { + sage_scale_row = head_info.bidb * params.h + head_info.bidh; + } + + // BMM2 epilogue + Tile_o_epilogue tile_o_epilogue(params, head_info); + + int q_step_idx = warpgroup_id; + + // Compute work. + for (; q_step_idx < q_steps; q_step_idx += NUM_COMPUTE_GROUPS) { + // Check whether it is a valid run of q steps. + int const q_offset = q_step_idx * STEP_Q + local_q_tile_offset; + bool const valid_run = q_offset < actual_q_seqlen; + // fuse the scale of q into scale_bmm1 + if constexpr (SAGE_BLOCK_SIZE_Q > 0) { + // I tried another implementation here: store original `scale_bmm1` to a local variable + // to avoid frequent `__ldg`. But experiment shows that the current one is faster. + // A bit counterintuitive. + auto const scale_bmm1 = + params.scale_bmm1_d ? __ldg(params.scale_bmm1_d) : params.scale_bmm1; + int const idx = sage_scale_row * params.sage.q.max_nblock + q_offset / SAGE_BLOCK_SIZE_Q; + *(float*)(&softmax.scale_bmm1_) = + reinterpret_cast(scale_bmm1) * __ldg(¶ms.sage.q.scales[idx]); + } + + // KV tile is shared by two q tiles, + // so we need to consider the last compute group's q tile. 
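// ----------------------------------------------------------------------------------------------
// Illustrative sketch (not part of the file above): how the two math warpgroups split the Q tiles
// of one head in the loop above, where q_step_idx starts at warpgroup_id and advances by
// NUM_COMPUTE_GROUPS, written as a plain host loop with hypothetical constants.
// ----------------------------------------------------------------------------------------------
#include <cstdio>

int main() {
  const int num_compute_groups = 2;  // NUM_COMPUTE_GROUPS
  const int q_steps = 7;             // STEP_Q-sized tiles assigned to this CTA for this head
  for (int warpgroup_id = 0; warpgroup_id < num_compute_groups; ++warpgroup_id) {
    for (int q = warpgroup_id; q < q_steps; q += num_compute_groups) {
      std::printf("warpgroup %d -> q tile %d\n", warpgroup_id, q);
    }
  }
  // warpgroup 0 handles tiles 0, 2, 4, 6; warpgroup 1 handles tiles 1, 3, 5.
  return 0;
}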
+ int const tile_offset_start = q_step_idx * STEP_Q + q_tile_offset; + int const tile_offset_end = tile_offset_start + STEP_Q - 1; + int const warpgroup_tile_offset_start = tile_offset_start - warpgroup_id * STEP_Q; + int const warpgroup_tile_offset_end = + tile_offset_start + (NUM_COMPUTE_GROUPS - warpgroup_id) * STEP_Q - 1; + + // Compute the kv_idx start (inclusive) and end (exclusive). + auto const [kv_idx_start, kv_idx_end] = DMA::Device::compute_kv_tile_idx( + params, warpgroup_tile_offset_start, warpgroup_tile_offset_end, kv_steps); + + // Compute the kv_left_mask_end and kv_right_mask_start, where mask is applied when kv_idx < + // kv_left_mask_end or kv_idx >= kv_right_mask_start. + auto const [kv_left_mask_end, kv_right_mask_start] = + compute_kv_mask_start_end(params, tile_offset_start, tile_offset_end, kv_idx_end); + + // The gmem O tile. + Gmem_tile_o gmem_o(params, head_info, *shared, tidx, + q_step_idx * STEP_Q + local_q_tile_offset); + + // Q ready to use in smem. + int ready = cbr.peek(); + if (!ready) { + cbr.wait(); + } + + static_assert(Mma_tile_p::CORES_M == 2); + float p_max[Mma_tile_p::CORES_M]; + float p_sum[Mma_tile_p::CORES_M]; + + int kv_step_idx = kv_idx_start; + // First K tiles ready to use in smem. + K_TILE_WAIT(); + // Need to apply mask if only kv tile exists. + if (kv_idx_start < kv_left_mask_end || kv_idx_start >= kv_right_mask_start) { + COMPUTE_SINGLE_TILE(true, true); + } else { + COMPUTE_SINGLE_TILE(true, false); + } + KV_TILE_COMPLETE(); + + for (kv_step_idx += 1; kv_step_idx < kv_right_mask_start; ++kv_step_idx) { + // Current step's K tiles ready to use in smem. + K_TILE_WAIT(); + + // Move kv tile to next buffer. + if (D_GROUPS > 1) { + ctile_p.increment_gmma_desc_group(); + } else { + ctile_p.increment_gmma_desc_b_group(); + } + + ctile_o.increment_gmma_desc_group(); + + // Apply the start mask only when sliding window attention is enabled. + if (kv_step_idx < kv_left_mask_end) { + COMPUTE_SINGLE_TILE(false, true); + } else { + COMPUTE_SINGLE_TILE(false, false); + } + + KV_TILE_COMPLETE(); + } + + // Always apply the mask in the end. + for (; kv_step_idx < kv_idx_end; ++kv_step_idx) { + // Current step's K tiles ready to use in smem. + K_TILE_WAIT(); + + // Move kv tile to next buffer. + if (D_GROUPS > 1) { + ctile_p.increment_gmma_desc_group(); + } else { + ctile_p.increment_gmma_desc_b_group(); + } + + ctile_o.increment_gmma_desc_group(); + + COMPUTE_SINGLE_TILE(false, true); + + KV_TILE_COMPLETE(); + } + if (valid_run) { + // Final step's update. + tile_o_epilogue.scale(ctile_o, p_max, p_sum); + // Store o_tile to gmem. + gmem_o.store(ctile_o.acc_); + } + + // Move q, kv to next buffer. 
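// ----------------------------------------------------------------------------------------------
// Illustrative sketch (not part of the file above): the shape of the KV loop above with the tile
// math replaced by a stub. The first visited tile always starts with fresh accumulators, tiles
// below kv_left_mask_end or at/after kv_right_mask_start take the masked COMPUTE_SINGLE_TILE
// variant, and the middle tiles skip masking entirely.
// ----------------------------------------------------------------------------------------------
#include <cstdio>

void kv_loop_shape(int kv_idx_start, int kv_idx_end, int kv_left_mask_end,
                   int kv_right_mask_start) {
  for (int kv = kv_idx_start; kv < kv_idx_end; ++kv) {
    const bool needs_mask = (kv < kv_left_mask_end) || (kv >= kv_right_mask_start);
    std::printf("kv tile %2d: %s%s\n", kv, needs_mask ? "masked" : "unmasked",
                kv == kv_idx_start ? " (first tile)" : "");
  }
}

// kv_loop_shape(2, 6, 3, 5) prints: tile 2 masked (first tile), tile 3 unmasked, tile 4 unmasked,
// tile 5 masked.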
+ ctile_p.increment_gmma_desc_a_group(); + ctile_p.increment_gmma_desc_b_group(); + ctile_o.increment_gmma_desc_group(); + + if constexpr (Kernel_traits::RETURN_SOFTMAX_STATS) { + using Mma_tile = typename Traits_p::template Mma_tile; + fmha::Softmax_saver_tma saver(params, head_info); + saver.store(p_sum, p_max, sqrtf(params.d), q_step_idx * STEP_Q, valid_run); + } + } + } + } + + //////////////////////////////////////////////////////////////////////////////////////////////// + + template + inline __device__ void compute_single_tile( + Params params, Compute_tile_p& ctile_p, Softmax& softmax, Compute_tile_o& ctile_o, + float (&p_max)[Mma_tile_p::CORES_M], float (&p_sum)[Mma_tile_p::CORES_M], int const tidx, + int const actual_kv_seqlen, float const alibi_head_scale, int const row_offset, + int const col_offset, int const sage_scale_row, Circular_buffer_q_reader& cbr, + Circular_buffer_kv_reader& cbr_v, OrderedMutexAccessor& mutex, bool complete = false) { +// load the scales of K/V from global memory +#define LOAD_SCALES_KV(dst, which, blocks_per_step, block_size) \ + if constexpr (block_size > 0) { \ + const int _start = col_offset / block_size; \ + const float* _src = \ + params.sage.which.scales + sage_scale_row * params.sage.which.max_nblock + _start; \ + const int _end = params.sage.which.max_nblock - _start; \ + _Pragma("unroll") for (int _i = 0; _i < blocks_per_step; _i++) { \ + dst[_i] = _i < _end ? _src[_i] : 1.0f; \ + } \ + } + +#define LOAD_SCALES_K(scales) LOAD_SCALES_KV(scales, k, SAGE_BLOCKS_PER_STEP_K, SAGE_BLOCK_SIZE_K) + +#define LOAD_SCALES_V(scales) LOAD_SCALES_KV(scales, v, SAGE_BLOCKS_PER_STEP_V, SAGE_BLOCK_SIZE_V) + + // Load the needed packed masks. + softmax.load_packed_mask(row_offset, col_offset); + + // experiments show that here is the best place to load scales of K + float scales_k[SAGE_BLOCKS_PER_STEP_K]; + LOAD_SCALES_K(scales_k) + + // Wait until another warpgroup has already executed HGMMA. + if constexpr (ENABLE_MUTEX && Kernel_traits::ELEMENT_BYTES == 2) { + mutex.wait(); + } + + // Ctile_p is only used once by each n step. + ctile_p.clear(); + + // BMM1 (Q x K'). + warpgroup_arrive(); + +// Only single K groups when sizeof(D) <= 128B. +#pragma unroll + for (int kbi = 0; kbi < BMM1_MMAS_K_GROUPS - 1; kbi++) { +#pragma unroll + for (int ki = 0; ki < BMM1_MMAS_K_PER_GROUP; ki++) { + ctile_p.compute(ki, false, ki == BMM1_MMAS_K_PER_GROUP - 1); + } + ctile_p.increment_gmma_desc_group(); + } + +#pragma unroll + for (int ki = 0; ki < BMM1_MMAS_K_PER_GROUP - 1; ki++) { + ctile_p.compute(ki); + } + + ctile_p.compute(BMM1_MMAS_K_PER_GROUP - 1, true, true); + + warpgroup_commit(); + warpgroup_wait<0>(); + + // Arrive when the last tile consumes the q tile. + if (complete) { + cbr.complete(tidx == 0, cbr.ptr()); + cbr.advance(); + } + + if constexpr (ENABLE_MUTEX && Kernel_traits::ELEMENT_BYTES == 2) { + // Notify another warpgroup to execute HGMMA. + mutex.arrive(); + } + if constexpr (ENABLE_MUTEX && Kernel_traits::ELEMENT_BYTES == 1) { + // Wait until another warpgroup has already executed QGMMA. + mutex.named_bar_wait(); + } + + // Fragment p for BMM2 input + Fragment_p frag_p[Mma_tile_o::MMAS_K]; + + // Unpack the elements from bmm1 output to floats. 
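// ----------------------------------------------------------------------------------------------
// Illustrative sketch (not part of the file above): the flash-attention style running-max and
// running-sum update that the p_max / p_sum values and the final Tile_o_epilogue rescale
// correspond to, shown for a single row on the host. This is a generic restatement of the
// algorithm, not the kernel's actual register layout, and it assumes every tile contributes at
// least one score.
// ----------------------------------------------------------------------------------------------
#include <cmath>
#include <vector>

// Returns the softmax denominator of one attention row processed tile by tile.
float online_softmax_denominator(const std::vector<std::vector<float>>& score_tiles) {
  float running_max = -INFINITY;
  float running_sum = 0.f;
  for (const std::vector<float>& tile : score_tiles) {
    float tile_max = running_max;
    for (float s : tile) tile_max = std::fmax(tile_max, s);
    // Rescale everything accumulated so far; the kernel applies the same factor to the
    // O accumulators before adding the new P x V contribution.
    running_sum *= std::exp(running_max - tile_max);
    for (float s : tile) running_sum += std::exp(s - tile_max);
    running_max = tile_max;
  }
  return running_sum;  // the epilogue divides the O accumulators by this value
}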
+ softmax.unpack(ctile_p); + // apply the scales of K before softmax + if constexpr (SAGE_BLOCK_SIZE_K > 0) { +#pragma unroll + for (int ni = 0; ni < Mma_tile_p::CORES_N; ni++) { + float const scale_k = scales_k[SAGE_BLOCKS_PER_STEP_K * ni / Mma_tile_p::CORES_N]; +#pragma unroll + for (int mi = 0; mi < Mma_tile_p::CORES_M; mi++) { + softmax.elt_[mi][2 * ni] *= scale_k; + softmax.elt_[mi][2 * ni + 1] *= scale_k; + } + } + } + + // Apply the alibi and mask. + softmax.apply_alibi_and_mask(ctile_p, params.alibi_params, alibi_head_scale, + actual_kv_seqlen, row_offset, col_offset); + + // Softmax Exp, max/sum, and update scales. + softmax.compute_and_update_scale(p_max, p_sum); + + // experiments show that here is the best place to load scales of V + float scales_v[SAGE_BLOCKS_PER_STEP_V]; + LOAD_SCALES_V(scales_v) + + // Update flash attention scales and pack it for BMM2 + softmax.pack(ctile_o, frag_p); + + if constexpr (ENABLE_MUTEX && Kernel_traits::ELEMENT_BYTES == 1) { + // Notify another warpgroup to execute QGMMA. + mutex.named_bar_arrive(); + } + + // Wait until v buffer is ready. + int ready = cbr_v.peek(); + if (!ready) { + cbr_v.wait(); + } + + warpgroup_arrive(); + + float last_scale_v; + +// Apply the scale of V to partial result. +// Note 2 points: +// 1. Because the matrix V is quantized along the inner dimension, it is necessary to interrupt +// the MMA workflow after processing each BLOCKS_SIZE_V rows of V and scale the intermediate +// results once. For example, STEP_KV=256, qgmma.K=32, then 256/32=8 MMAs are needs, +// so mma_ki = [0,1,2, ..., 7]. If the BLOCK_SIZE_V=64, then after each 2 qgmmas we should scale +// ctile_o. +// 2. The ctile_o is all zero at the beginning. if we directly apply the scale of V after each 2 +// qgmmas, let's see what happens: +// ctile_o = [0] +// ctile_o = (ctile_o + P0 x V0) * s0 = P0 x V0 * s0 +// ctile_o = (ctile_o + P1 x V1) * s1 = P0 x V0 * s0 * s1 + P1 x V1 * s1 +// ctile_o = (ctile_o + P2 x V2) * s2 = P0 x V0 * s0 * s1 * s2 + P1 x V1 * s1 * s2 + P2 x V2 * +// s2 +// ... +// As you see, the actual scale of a V block is the cumulative product of the scales of all +// later blocks. To solve this, we have to preprocess the scale s[i] of block[i] to s[i]/s[i+1], +// and the final block uses the actual scale. +// But to fetch the next scale in next STEP leads to bad performance. So we apply s[i-1]/s[i] to +// current partial result BEFORE each V block. +#define APPLY_SCALE_V(mma_ki) \ + if constexpr (SAGE_BLOCK_SIZE_V > 0) { \ + if (mma_ki % (Mma_tile_o::MMAS_K / SAGE_BLOCKS_PER_STEP_V) == 0) { \ + float _scale_v = scales_v[SAGE_BLOCKS_PER_STEP_V * mma_ki / Mma_tile_o::MMAS_K]; \ + if (mma_ki != 0) { \ + warpgroup_commit(); \ + warpgroup_wait<0>(); \ + } \ + last_scale_v = _scale_v; \ + } \ + } + +// BMM2 (S * V). 
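// ----------------------------------------------------------------------------------------------
// Illustrative sketch (not part of the file above): a numeric check of the scale-ordering
// argument in the comment above. Dequantizing each block up front (reference) gives the same
// result as multiplying the running accumulator by s[i-1]/s[i] before block i and applying the
// last block's scale once at the end. Values are arbitrary stand-ins.
// ----------------------------------------------------------------------------------------------
#include <cassert>
#include <cmath>
#include <cstdio>

int main() {
  const int n = 4;
  const float pv[n] = {1.0f, 2.0f, 3.0f, 4.0f};  // stand-ins for the per-block P x V partials
  const float s[n] = {0.5f, 2.0f, 1.5f, 0.25f};  // per-block V dequantization scales

  float reference = 0.f;
  for (int i = 0; i < n; ++i) reference += s[i] * pv[i];

  float acc = 0.f;
  for (int i = 0; i < n; ++i) {
    if (i > 0) acc *= s[i - 1] / s[i];  // correction applied BEFORE each new block
    acc += pv[i];
  }
  acc *= s[n - 1];  // the final block's scale is applied once, after the loop

  std::printf("reference=%f reordered=%f\n", reference, acc);
  assert(std::fabs(reference - acc) < 1e-4f);
  return 0;
}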
+#pragma unroll + for (int kbi = 0; kbi < BMM2_MMAS_K_GROUPS - 1; kbi++) { +#pragma unroll + for (int ki = 0; ki < BMM2_MMAS_K_PER_GROUP; ++ki) { + int const mma_ki = kbi * BMM2_MMAS_K_PER_GROUP + ki; + APPLY_SCALE_V(mma_ki) + ctile_o.fill_frag_a(frag_p[mma_ki]); + ctile_o.compute(ki, false, ki == BMM2_MMAS_K_PER_GROUP - 1); + } + ctile_o.increment_gmma_desc_group(); + } + +#pragma unroll + for (int ki = 0; ki < BMM2_MMAS_K_PER_GROUP - 1; ++ki) { + int const mma_ki = (BMM2_MMAS_K_GROUPS - 1) * BMM2_MMAS_K_PER_GROUP + ki; + APPLY_SCALE_V(mma_ki) + ctile_o.fill_frag_a(frag_p[mma_ki]); + ctile_o.compute(ki); + } + + APPLY_SCALE_V((Mma_tile_o::MMAS_K - 1)) + ctile_o.fill_frag_a(frag_p[Mma_tile_o::MMAS_K - 1]); + ctile_o.compute(Mma_tile_o::MMAS_K - 1, true, true); + + warpgroup_commit(); + warpgroup_wait<0>(); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace ws +} // namespace fmha diff --git a/csrc/fmha_v2/fmha/warpspec/dma.h b/csrc/fmha_v2/fmha/warpspec/dma.h new file mode 100644 index 0000000000..a14ccafdf3 --- /dev/null +++ b/csrc/fmha_v2/fmha/warpspec/dma.h @@ -0,0 +1,874 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2011-2024 NVIDIA CORPORATION & AFFILIATES. All rights + * reserved. SPDX-License-Identifier: NVIDIA TensorRT Source Code License Agreement + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. + */ + +#pragma once +#include +#include +#include +#include + +#include "fmha/hopper/arrive_wait.h" +#include "fmha/hopper/smem_tile.h" +#include "fmha/utils.h" + +namespace fmha { +namespace ws { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct DMA { + // The shared struct. + using Shared = typename Kernel_traits::Shared; + // The kv buffer writer. + using Circular_buffer_kv_writer = typename Kernel_traits::Circular_buffer_kv_writer; + using Circular_buffer_v_scratch_reader = typename Kernel_traits::Circular_buffer_v_scratch_reader; + using Circular_buffer_v_scratch_writer = typename Kernel_traits::Circular_buffer_v_scratch_writer; + + // The step size of Q loop. + enum { STEP_Q = Kernel_traits::STEP_Q }; + + // The step size of KV loop. + enum { STEP_KV = Kernel_traits::STEP_KV }; + + // The tile size of Q. + enum { TILE_SIZE_Q = STEP_Q * Kernel_traits::D }; + + // The tile size of Q after head_dimension split. + enum { TILE_SIZE_Q_PER_D_GROUP = STEP_Q * Kernel_traits::D_PER_GROUP }; + + // The tile size of K. + enum { TILE_SIZE_K = STEP_KV * Kernel_traits::D }; + + // The tile size of K after head_dimension split. + enum { TILE_SIZE_K_PER_D_GROUP = STEP_KV * Kernel_traits::D_PER_GROUP }; + + // The tile size of V. + enum { TILE_SIZE_V = STEP_KV * Kernel_traits::DV }; + + // The tile size of V after head_dimension split. + enum { TILE_SIZE_V_PER_D_GROUP = TILE_SIZE_K_PER_D_GROUP }; + + // Whether apply causal mask or not. + enum { CAUSAL_MASK = Kernel_traits::CAUSAL_MASK }; + + // Whether use custom mask input or not. + enum { USE_CUSTOM_MASK = Kernel_traits::USE_CUSTOM_MASK }; + + // Whether we skip those masked tiles when causal mask is enabled ? 
+ enum { SKIP_CAUSAL_MASK_TILES = CAUSAL_MASK && !USE_CUSTOM_MASK }; + + // Whether we attend to the specific sliding window or chunk ? + enum { SLIDING_OR_CHUNKED_ATTENTION = Kernel_traits::SLIDING_OR_CHUNKED_ATTENTION }; + + // Is heads interleaved ? + enum { HEADS_INTERLEAVED = Kernel_traits::HEADS_INTERLEAVED }; + + // Named barrier for inter-warpgroup sync + enum { SYNC_BARRIER = Kernel_traits::DMA_SYNC_BARRIER_ID }; + + // The number of compute groups (currently fixed at 2). + enum { NUM_COMPUTE_GROUPS = Kernel_traits::NUM_COMPUTE_GROUPS }; + + // The tile scheduling mode: static (0), dynamic (1) + enum { SCHEDULING_MODE = Kernel_traits::SCHEDULING_MODE }; + + // Whether read from paged kv buffers or not. + enum { PAGED_KV_INPUT = Kernel_traits::PAGED_KV_INPUT }; + + // Whether the dma group transposes the v tile explicitly. + enum { DMA_GROUP_TRANSPOSE_V = Kernel_traits::DMA_GROUP_TRANSPOSE_V }; + + // How many threads get involved in the dma group. + enum { NUM_THREADS_IN_DMA_GROUP = Kernel_traits::NUM_THREADS_IN_DMA_GROUP }; + + // Transpose V + // K is the sequence length dimension (128 for GMMA). The unroll factor is decided according to + // empirical evidence so as to avoid register spill. + enum { K_ = STEP_KV % 128 == 0 ? 128 : 64 }; + + static_assert(STEP_KV % K_ == 0); + using Transposer = + Transposer 128 || SLIDING_OR_CHUNKED_ATTENTION) ? 1 : 2 /* UNROLL */>; + + struct Device { + // Only the warpgroup leader initiates mbarriers & TMA operations. + uint32_t elect_one_; + // The sum_s for q. + int sum_s_q_; + // The sum_s for kv. + int sum_s_kv_; + // Tile id for q tile scheduling + uint32_t tile_id_; + + inline __device__ Device(uint32_t elect_one) : elect_one_(elect_one) {} + + //////////////////////////////////////////////////////////////////////////////////////////// + + // Compute the kv tile idx start (inclusive) and end (exclusive). + static inline __device__ std::pair compute_kv_tile_idx( + bert::Fused_multihead_attention_params_v2 const& params, int q_step_offset, int q_step_end, + int kv_steps) { + // The default kv_idx_start and kv_idx_end (exclusive). + int kv_idx_start = 0; + int kv_idx_end = kv_steps; + + // Is the chunked_attention used ? + bool is_chunked_attention = params.log2_chunked_attention_size > 0; + + // Skip initial kv tiles due to sliding_window_size + if (SLIDING_OR_CHUNKED_ATTENTION) { + // The kv_offset_start. + int kv_offset_start = is_chunked_attention + ? ((q_step_offset >> params.log2_chunked_attention_size) + << params.log2_chunked_attention_size) + : max(0, q_step_offset + 1 - params.sliding_window_size); + kv_idx_start = kv_offset_start / STEP_KV; + } + + // Early stop when causal mask is enabled. + if (SKIP_CAUSAL_MASK_TILES) { + kv_idx_end = (q_step_end + STEP_KV - 1) / STEP_KV; + } + + return std::make_pair(kv_idx_start, kv_idx_end); + } + + //////////////////////////////////////////////////////////////////////////////////////////// + + // Packed contiguous QKV input. + inline __device__ void run_packed_qkv(bert::Fused_multihead_attention_params_v2 const& params, + Shared* shared) { + // DMA. 
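// ----------------------------------------------------------------------------------------------
// Illustrative sketch (not part of the file above): compute_kv_tile_idx for the causal plus
// sliding-window case, with the kernel traits replaced by plain arguments. The start skips KV
// tiles that every Q row in the step ignores; the end stops at the causal diagonal of the last
// Q row handled by this DMA step. The chunked-attention branch is omitted and names are
// hypothetical.
// ----------------------------------------------------------------------------------------------
#include <algorithm>
#include <utility>

std::pair<int, int> kv_tile_range(int q_step_offset, int q_step_end, int kv_steps,
                                  int sliding_window_size, int step_kv, bool causal,
                                  bool sliding_window) {
  int kv_idx_start = 0;
  int kv_idx_end = kv_steps;
  if (sliding_window) {
    kv_idx_start = std::max(0, q_step_offset + 1 - sliding_window_size) / step_kv;
  }
  if (causal) {
    kv_idx_end = (q_step_end + step_kv - 1) / step_kv;
  }
  return {kv_idx_start, kv_idx_end};
}

// Example: Q rows [1024, 1535], window 512, 8 KV tiles of 256 tokens ->
// kv_tile_range(1024, 1535, 8, 512, 256, true, true) == {2, 6}: only tiles 2..5 are loaded.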
+ int local_wid = (threadIdx.x / 32) % 4; + int tiw = threadIdx.x % 32; + uint32_t smem_tile_id = __cvta_generic_to_shared(&shared->tile_id); + + if (SCHEDULING_MODE == 0) { + tile_id_ = blockIdx.y; + } else { + get_next_tile_id(local_wid, tiw, smem_tile_id, params.tile_id_counter_ptr); + } + + auto cbw0 = shared->tma_q_tracker[0].createWriter(); + auto cbw1 = shared->tma_q_tracker[1].createWriter(); + Circular_buffer_kv_writer cbw_k = shared->tma_k_tracker.createWriter(); + Circular_buffer_kv_writer cbw_v = shared->tma_v_tracker.createWriter(); + Circular_buffer_v_scratch_reader cbr_v_scratch = shared->tma_v_scratch_tracker.createReader(); + Circular_buffer_v_scratch_writer cbw_v_scratch = shared->tma_v_scratch_tracker.createWriter(); + auto headinfo_tracker0 = shared->head_info_tracker[0].createWriter(); + auto headinfo_tracker1 = shared->head_info_tracker[1].createWriter(); + + while (tile_id_ < params.num_tiles) { + // If we do bidh = next_head % h, we'd guarantee b to be spread across CTAs. + + int bidb, tmp, bidh, q_step_offset, q_steps; + + if (SCHEDULING_MODE == 0) { + bidh = tile_id_ % params.h; + bidb = tile_id_ / params.h; + } else { + // Balanced dynamic scheduling + if (CAUSAL_MASK && !SLIDING_OR_CHUNKED_ATTENTION && params.use_balanced_scheduling) { + q_step_offset = (params.num_tiles_per_head - 1 - tile_id_ / (params.b * params.h)) * + NUM_COMPUTE_GROUPS; + tmp = tile_id_ % (params.b * params.h); + bidh = tmp / params.b; + bidb = tmp % params.b; + q_steps = NUM_COMPUTE_GROUPS; + } else { // Unbalanced dynamic scheduling + bidb = tile_id_ / (params.h * params.num_tiles_per_head); + tmp = tile_id_ % (params.h * params.num_tiles_per_head); + bidh = tmp / params.num_tiles_per_head; + q_step_offset = tmp % params.num_tiles_per_head * NUM_COMPUTE_GROUPS; + q_steps = NUM_COMPUTE_GROUPS; + } + } + + cudaTmaDesc const* desc_q = ¶ms.tma_desc_q; + cudaTmaDesc const* desc_k = ¶ms.tma_desc_k; + cudaTmaDesc const* desc_v = ¶ms.tma_desc_v; + int actual_seqlen; + if (params.is_s_padded) { + sum_s_q_ = bidb * params.s; + actual_seqlen = params.cu_q_seqlens[bidb + 1] - params.cu_q_seqlens[bidb]; + } else { + sum_s_q_ = params.cu_q_seqlens[bidb]; + actual_seqlen = params.cu_q_seqlens[bidb + 1] - sum_s_q_; + } + sum_s_kv_ = sum_s_q_; + + // The cumulative packed_mask seqlens. + // Each sequence length in the batch has to be padded to multiple of 128. + int sum_mask_s = params.cu_mask_rows[bidb]; + + if (SCHEDULING_MODE == 0) { + // split work across M + q_steps = (actual_seqlen + STEP_Q - 1) / STEP_Q; + + // Q_steps may be distributed to multiple blocks to increase the occupacy + // when b*h is small. + // The number of q_steps needs to be multiple of 2. + q_steps = (q_steps + gridDim.x - 1) / gridDim.x; + q_steps += (q_steps & 1); + // The last block may process fewer q_steps. + q_step_offset = q_steps * blockIdx.x; + } + + int q_tile_offset = q_step_offset * STEP_Q; + if (q_tile_offset >= actual_seqlen) { + if (SCHEDULING_MODE == 0) { + tile_id_ += gridDim.y; + } else { + get_next_tile_id(local_wid, tiw, smem_tile_id, params.tile_id_counter_ptr); + } + continue; + } + + // Split work across N. 
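// ----------------------------------------------------------------------------------------------
// Illustrative sketch (not part of the file above): decoding a flat tile_id under the balanced
// dynamic-scheduling branch above, with b, h and num_tiles_per_head as plain ints. Because
// q_step_offset shrinks as tile_id grows, tiles fetched later are the cheaper ones near the top
// of the causal triangle, which evens out work across CTAs. Names are hypothetical.
// ----------------------------------------------------------------------------------------------
#include <cstdio>

struct TileCoords {
  int bidb, bidh, q_step_offset;
};

TileCoords decode_balanced(int tile_id, int b, int h, int num_tiles_per_head,
                           int num_compute_groups) {
  TileCoords c;
  c.q_step_offset = (num_tiles_per_head - 1 - tile_id / (b * h)) * num_compute_groups;
  const int rem = tile_id % (b * h);
  c.bidh = rem / b;
  c.bidb = rem % b;
  return c;
}

int main() {
  for (int t = 0; t < 8; ++t) {
    const TileCoords c = decode_balanced(t, /*b=*/2, /*h=*/2, /*num_tiles_per_head=*/2,
                                         /*num_compute_groups=*/2);
    std::printf("tile_id %d -> bidb=%d bidh=%d q_step_offset=%d\n", t, c.bidb, c.bidh,
                c.q_step_offset);
  }
  return 0;
}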
+ int const kv_steps = (actual_seqlen + STEP_KV - 1) / STEP_KV; + for (int q_step_idx = 0; q_step_idx < q_steps; q_step_idx += 2) { + load_q(bidh, (q_step_idx + 0 + q_step_offset) * STEP_Q, desc_q, shared->smem_q[0], cbw0); + load_q(bidh, (q_step_idx + 1 + q_step_offset) * STEP_Q, desc_q, shared->smem_q[1], cbw1); + + // Q step bound is 2 tiles away at this moment because of 2x1 math warpgroup + int const q_step_end = (q_step_idx + q_step_offset + 2) * STEP_Q - 1; + + // The kv tile idx range for this q step. + auto const [kv_idx_start, kv_idx_end] = compute_kv_tile_idx( + params, (q_step_idx + q_step_offset) * STEP_Q, q_step_end, kv_steps); + + // Iterate over the kv tiles for this q step. + for (int kv_step_idx = kv_idx_start; kv_step_idx < kv_idx_end; kv_step_idx++) { + int bar_id = load_kv(bidh / params.h_q_per_kv, kv_step_idx * STEP_KV, desc_k, desc_v, + shared, cbw_k, cbw_v, cbw_v_scratch); + + // Opportunistically hide headinfo in the shadow of UTMALDGs of the QKV tensor + if (q_step_idx == 0 && kv_step_idx == kv_idx_start) { + // Send head info. + typename Shared::Head_info info{ + q_steps, + // q, and kv have the same length. + q_tile_offset, USE_CUSTOM_MASK ? sum_mask_s : q_tile_offset, kv_steps, + // q, and kv have the same length. + actual_seqlen, actual_seqlen, sum_s_q_ * params.h + bidh, bidh, bidb}; + // NOTE(tizheng): The need for the sync after consumer bar wait is to avoid a deadlock + // hazard when DMA thread 0 is ahead of other DMA threads. For example: DMA thread 0 + // have finished consumer bar wait phase 0 and producer bar arrive phase 0, and then + // MMA warps have finished producer bar wait phase 0 and consumer bar arrive phase 1. + // At this time other DMA threads start consumer bar wait phase 0. It will never + // become ready. DMA warps then fail to continue to the next loop. + // + // It is the same consideration for the sync after tmaReserve in load_q and load_kv + // implementation below. + headinfo_tracker0.template push_with_sync( + elect_one_, info); + headinfo_tracker1.template push_with_sync( + elect_one_, info); + } + + if constexpr (DMA_GROUP_TRANSPOSE_V) { + transpose_v_tile(bar_id, shared, cbw_v, cbr_v_scratch); + } + } // kv + } // q + + if (SCHEDULING_MODE == 0) { + tile_id_ += gridDim.y; + } else { + get_next_tile_id(local_wid, tiw, smem_tile_id, params.tile_id_counter_ptr); + } + } // gridDim.y + // Signal compute groups to break. + headinfo_tracker0.template push_with_sync( + elect_one_, {-1, -1, -1, -1, -1, -1, -1, -1}); + headinfo_tracker1.template push_with_sync( + elect_one_, {-1, -1, -1, -1, -1, -1, -1, -1}); + } + + // Support contiguous Q + contiguous/paged KV separate cache. + inline __device__ void run_separate_q_and_kv( + bert::Fused_multihead_attention_params_v2 const& params, Shared* shared) { + // DMA. 
+ int local_wid = (threadIdx.x / 32) % 4; + int tiw = threadIdx.x % 32; + uint32_t smem_tile_id = __cvta_generic_to_shared(&shared->tile_id); + + if (SCHEDULING_MODE == 0) { + tile_id_ = blockIdx.y; + } else { + get_next_tile_id(local_wid, tiw, smem_tile_id, params.tile_id_counter_ptr); + } + + auto cbw0 = shared->tma_q_tracker[0].createWriter(); + auto cbw1 = shared->tma_q_tracker[1].createWriter(); + Circular_buffer_kv_writer cbw_k = shared->tma_k_tracker.createWriter(); + Circular_buffer_kv_writer cbw_v = shared->tma_v_tracker.createWriter(); + Circular_buffer_v_scratch_reader cbr_v_scratch = shared->tma_v_scratch_tracker.createReader(); + Circular_buffer_v_scratch_writer cbw_v_scratch = shared->tma_v_scratch_tracker.createWriter(); + auto headinfo_tracker0 = shared->head_info_tracker[0].createWriter(); + auto headinfo_tracker1 = shared->head_info_tracker[1].createWriter(); + + while (tile_id_ < params.num_tiles) { + // If we do bidh = next_head % h, we'd guarantee b to be spread across CTAs. + + int bidb, tmp, bidh, local_q_tile_offset, q_steps; + + if (SCHEDULING_MODE == 0) { + bidh = tile_id_ % params.h; + bidb = tile_id_ / params.h; + } else if (SCHEDULING_MODE == 1) { + bidb = tile_id_ / (params.h * params.num_tiles_per_head); + tmp = tile_id_ % (params.h * params.num_tiles_per_head); + bidh = tmp / params.num_tiles_per_head; + local_q_tile_offset = (tmp % params.num_tiles_per_head) * NUM_COMPUTE_GROUPS * STEP_Q; + q_steps = NUM_COMPUTE_GROUPS; + } else { // SCHEDULING_MODE == 2 + local_q_tile_offset = (params.num_tiles_per_head - 1 - tile_id_ / (params.b * params.h)) * + NUM_COMPUTE_GROUPS * STEP_Q; + tmp = tile_id_ % (params.b * params.h); + bidh = tmp / params.b; + bidb = tmp % params.b; + q_steps = NUM_COMPUTE_GROUPS; + } + int bidh_kv = bidh / params.h_q_per_kv; + + // Sequence length parameters. + // Take chunked attention (q, and kv may have difference sequence length) into + // consideration. + sum_s_q_ = params.is_s_padded ? bidb * params.s : params.cu_q_seqlens[bidb]; + sum_s_kv_ = params.is_s_padded ? bidb * params.s : params.cu_kv_seqlens[bidb]; + int actual_q_seqlen = params.cu_q_seqlens[bidb + 1] - params.cu_q_seqlens[bidb]; + int actual_kv_seqlen = params.cu_kv_seqlens[bidb + 1] - params.cu_kv_seqlens[bidb]; + int past_kv_length = actual_kv_seqlen - actual_q_seqlen; + + // The cumulative packed_mask seqlens. + // Each sequence length in the batch has to be padded to multiple of 128. + int sum_mask_s = params.cu_mask_rows[bidb]; + + // Prepare the tma descriptors. + cudaTmaDesc const* desc_q = ¶ms.tma_desc_q; + cudaTmaDesc const* desc_k = ¶ms.tma_desc_k; + cudaTmaDesc const* desc_v = ¶ms.tma_desc_v; + + int32_t const* paged_block_offsets = + params.paged_kv_cache.mBlockOffsets + bidb * 2 * params.paged_kv_cache.mMaxBlocksPerSeq; + + if (SCHEDULING_MODE == 0) { + // split work across M + q_steps = (actual_q_seqlen + STEP_Q - 1) / STEP_Q; + + // Q_steps may be distributed to multiple blocks to increase the occupacy + // when b*h is small. + // The number of q_steps needs to be multiple of 2. + q_steps = (q_steps + gridDim.x - 1) / gridDim.x; + q_steps += (q_steps & 1); + local_q_tile_offset = q_steps * blockIdx.x * STEP_Q; + } + + // The last block may process fewer q_steps. + if (local_q_tile_offset >= actual_q_seqlen) { + if (SCHEDULING_MODE == 0) { + tile_id_ += gridDim.y; + } else { + get_next_tile_id(local_wid, tiw, smem_tile_id, params.tile_id_counter_ptr); + } + continue; + } + + // The global q tile offset which includes the past kv cache. 
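// ----------------------------------------------------------------------------------------------
// Illustrative sketch (not part of the file above): the variable-sequence-length bookkeeping for
// the non-padded path above, computed from the cumulative length arrays. past_kv is the number
// of tokens already present in the KV cache, and it is exactly the amount by which the local Q
// tile offset has to be shifted so the causal diagonal lines up against the full KV sequence.
// Names are hypothetical; the is_s_padded path uses bidb * s instead.
// ----------------------------------------------------------------------------------------------
#include <vector>

struct SeqInfo {
  int sum_s_q, sum_s_kv, actual_q, actual_kv, past_kv;
};

SeqInfo seq_info(const std::vector<int>& cu_q_seqlens, const std::vector<int>& cu_kv_seqlens,
                 int bidb) {
  SeqInfo s;
  s.sum_s_q = cu_q_seqlens[bidb];
  s.sum_s_kv = cu_kv_seqlens[bidb];
  s.actual_q = cu_q_seqlens[bidb + 1] - cu_q_seqlens[bidb];
  s.actual_kv = cu_kv_seqlens[bidb + 1] - cu_kv_seqlens[bidb];
  s.past_kv = s.actual_kv - s.actual_q;
  return s;
}

// Example: cu_q_seqlens = {0, 128, 256}, cu_kv_seqlens = {0, 1152, 2304} -> sequence 0 has 128
// new Q rows against 1152 KV tokens, so past_kv = 1024 and its global q_tile_offset starts there.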
+ int q_tile_offset = local_q_tile_offset + past_kv_length; + // Split work across N. + int const kv_steps = (actual_kv_seqlen + STEP_KV - 1) / STEP_KV; + // Page KV: number of valid kv blocks (others might be nullptr). + int const num_valid_kv_blocks = + (actual_kv_seqlen + params.paged_kv_cache.mTokensPerBlock - 1) >> + params.paged_kv_cache.mTokensPerBlockLog2; + + for (int q_step_idx = 0; q_step_idx < q_steps && actual_kv_seqlen > 0; q_step_idx += 2) { + load_q(bidh, q_step_idx * STEP_Q + local_q_tile_offset, desc_q, shared->smem_q[0], cbw0); + load_q(bidh, (q_step_idx + 1) * STEP_Q + local_q_tile_offset, desc_q, shared->smem_q[1], + cbw1); + + // Q step end is 2 tiles away at this moment because of 2x1 math warpgroup + int const q_step_end = (q_step_idx + 2) * STEP_Q - 1 + q_tile_offset; + + // The kv tile idx range for this q step. + auto const [kv_idx_start, kv_idx_end] = compute_kv_tile_idx( + params, q_step_idx * STEP_Q + q_tile_offset, q_step_end, kv_steps); + + // Iterate over the kv tiles for this q step. + for (int kv_step_idx = kv_idx_start; kv_step_idx < kv_idx_end; kv_step_idx++) { + // The barrier id. + int bar_id; + // Load paged kv input. + if constexpr (PAGED_KV_INPUT) { + bar_id = load_paged_kv(bidh_kv, kv_step_idx * STEP_KV, num_valid_kv_blocks, + params.paged_kv_cache.mTokensPerBlockLog2, + params.blocks_per_tma_load, params.blocks_per_tma_load_log2, + params.paged_kv_cache.mMaxBlocksPerSeq, paged_block_offsets, + desc_k, desc_v, shared, cbw_k, cbw_v, cbw_v_scratch); + } else { + bar_id = load_kv(bidh_kv, kv_step_idx * STEP_KV, desc_k, desc_v, shared, cbw_k, cbw_v, + cbw_v_scratch); + } + + // Opportunistically hide headinfo in the shadow of UTMALDGs of the QKV tensor + if (q_step_idx == 0 && kv_step_idx == kv_idx_start) { + // Send head info. + typename Shared::Head_info info{q_steps, + local_q_tile_offset, + USE_CUSTOM_MASK ? sum_mask_s : q_tile_offset, + kv_steps, + actual_q_seqlen, + actual_kv_seqlen, + sum_s_q_ * params.h + bidh, + bidh, + bidb}; + headinfo_tracker0.template push_with_sync( + elect_one_, info); + headinfo_tracker1.template push_with_sync( + elect_one_, info); + } + if constexpr (DMA_GROUP_TRANSPOSE_V) { + transpose_v_tile(bar_id, shared, cbw_v, cbr_v_scratch); + } + } // kv + } // q + + if (SCHEDULING_MODE == 0) { + tile_id_ += gridDim.y; + } else { + get_next_tile_id(local_wid, tiw, smem_tile_id, params.tile_id_counter_ptr); + } + } // gridDim.y + + // Signal compute groups to break. + headinfo_tracker0.template push_with_sync( + elect_one_, {-1, -1, -1, -1, -1, -1, -1, -1}); + headinfo_tracker1.template push_with_sync( + elect_one_, {-1, -1, -1, -1, -1, -1, -1, -1}); + } + + // Load q tiles from gmem to smem by TMA. 
+ template + inline __device__ void load_q(int bidh, int q_tile_start_offset, cudaTmaDesc const* desc_q, + Smem_q& smem_q, BufferWriter& cbw) { + int barrier_id = cbw.tmaReserve(elect_one_, TILE_SIZE_Q * Kernel_traits::ELEMENT_BYTES); + + named_barrier_wait(SYNC_BARRIER, NUM_THREADS_IN_DMA_GROUP); + + // split D into multiple groups in order to satisfy the TMA 128B sizzle mode +#pragma unroll + for (int di = 0; di < Kernel_traits::D_GROUPS; ++di) { + const int32_t coords[3] = {di * Kernel_traits::D_PER_GROUP, bidh, + sum_s_q_ + q_tile_start_offset}; + fmha::utmaldg<3, fmha::cudaTmaDescType::TILED, false>( + desc_q, + __cvta_generic_to_shared( + &smem_q[barrier_id * TILE_SIZE_Q + di * TILE_SIZE_Q_PER_D_GROUP]), + __cvta_generic_to_shared(cbw.barrier_ptr(barrier_id)), coords, elect_one_); + } + } + +#define PREPARE_KV_BUFFER() \ + int k_barrier_id = cbw_k.tmaReserve(elect_one_, (TILE_SIZE_K) * Kernel_traits::ELEMENT_BYTES); \ + \ + int v_barrier_id; \ + void* v_barrier_ptr; \ + typename Kernel_traits::Element_data_type* v_smem; \ + \ + if constexpr (DMA_GROUP_TRANSPOSE_V) { \ + v_barrier_id = \ + cbw_v_scratch.tmaReserve(elect_one_, (TILE_SIZE_V) * Kernel_traits::ELEMENT_BYTES); \ + v_barrier_ptr = cbw_v_scratch.barrier_ptr(v_barrier_id); \ + v_smem = shared->smem_v_scratch.data(); \ + } else { \ + v_barrier_id = cbw_v.tmaReserve(elect_one_, (TILE_SIZE_V) * Kernel_traits::ELEMENT_BYTES); \ + v_barrier_ptr = cbw_v.barrier_ptr(v_barrier_id); \ + v_smem = shared->smem_v.data(); \ + } \ + \ + named_barrier_wait(SYNC_BARRIER, NUM_THREADS_IN_DMA_GROUP); + + // Load k,v tiles from gmem to smem by TMA. + template + inline __device__ int load_kv(int bidh_kv, int kv_tile_start_offset, cudaTmaDesc const* desc_k, + cudaTmaDesc const* desc_v, Shared* shared, BufferWriter& cbw_k, + BufferWriter& cbw_v, BufferWriterScratch& cbw_v_scratch) { + PREPARE_KV_BUFFER() + + // split D into multiple groups in order to satisfy the TMA 128B sizzle mode +#pragma unroll + for (int di = 0; di < Kernel_traits::D_GROUPS; ++di) { + const int32_t k_coords[3] = {di * Kernel_traits::D_PER_GROUP, bidh_kv, + sum_s_kv_ + kv_tile_start_offset}; + + fmha::utmaldg<3, fmha::cudaTmaDescType::TILED, false>( + desc_k, + __cvta_generic_to_shared( + &shared->smem_k[k_barrier_id * TILE_SIZE_K + di * TILE_SIZE_K_PER_D_GROUP]), + __cvta_generic_to_shared(cbw_k.barrier_ptr(k_barrier_id)), k_coords, elect_one_); + } + +#pragma unroll + for (int di = 0; di < Kernel_traits::DV_GROUPS; ++di) { + const int32_t v_coords[3] = {di * Kernel_traits::D_PER_GROUP, bidh_kv, + sum_s_kv_ + kv_tile_start_offset}; + + fmha::utmaldg<3, fmha::cudaTmaDescType::TILED, false>( + desc_v, + __cvta_generic_to_shared( + &v_smem[v_barrier_id * TILE_SIZE_V + di * TILE_SIZE_V_PER_D_GROUP]), + __cvta_generic_to_shared(v_barrier_ptr), v_coords, elect_one_); + } + + return v_barrier_id; + } + + // Load paged k,v tiles from gmem to smem by TMA. + template + inline __device__ int load_paged_kv(int bidh_kv, int kv_tile_start_offset, + int num_valid_kv_blocks, int tokens_per_block_log2, + int blocks_per_tma_load, int blocks_per_tma_load_log2, + int max_blocks_per_sequence, + int32_t const* paged_block_offsets, + cudaTmaDesc const* desc_k, cudaTmaDesc const* desc_v, + Shared* shared, BufferWriter& cbw_k, BufferWriter& cbw_v, + BufferWriterScratch& cbw_v_scratch) { + PREPARE_KV_BUFFER() + + // Paged KV cache block idx. 
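// ----------------------------------------------------------------------------------------------
// Illustrative sketch (not part of the file above): the paged-KV index arithmetic computed just
// below, mapping the first token of a KV tile to a cache block plus an in-block offset, and
// clamping to the last block that is actually backed by data, as the TMA loop does. Values are
// arbitrary.
// ----------------------------------------------------------------------------------------------
#include <algorithm>
#include <cstdio>

int main() {
  const int tokens_per_block_log2 = 6;   // 64 tokens per paged KV block
  const int kv_tile_start_offset = 384;  // first token covered by this STEP_KV tile
  const int num_valid_kv_blocks = 5;     // blocks that hold real tokens for this sequence

  const int block_idx = kv_tile_start_offset >> tokens_per_block_log2;
  const int offset_in_block = kv_tile_start_offset & ((1 << tokens_per_block_log2) - 1);
  const int bounded_idx = std::min(num_valid_kv_blocks - 1, block_idx);

  std::printf("block=%d offset=%d bounded=%d\n", block_idx, offset_in_block, bounded_idx);
  return 0;
}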
+ int paged_kv_block_idx = kv_tile_start_offset >> tokens_per_block_log2; + int kv_offset_in_block = kv_tile_start_offset & ((1 << tokens_per_block_log2) - 1); + + // coordinates: d, s, h, 1 + int const tile_size_k_per_block = TILE_SIZE_K_PER_D_GROUP >> blocks_per_tma_load_log2; + static_assert(TILE_SIZE_V_PER_D_GROUP == TILE_SIZE_K_PER_D_GROUP, + "KV tile should have the same tensor size."); + for (int bi = 0; bi < blocks_per_tma_load; ++bi) { + int const bounded_block_idx = min(num_valid_kv_blocks - 1, paged_kv_block_idx + bi); + + const int32_t k_paged_block_offset = paged_block_offsets[bounded_block_idx]; + const int32_t v_paged_block_offset = + paged_block_offsets[max_blocks_per_sequence + bounded_block_idx]; + +#pragma unroll + for (int di = 0; di < Kernel_traits::D_GROUPS; ++di) { + const int32_t k_coords[4] = {di * Kernel_traits::D_PER_GROUP, kv_offset_in_block, bidh_kv, + k_paged_block_offset}; + + fmha::utmaldg<4, fmha::cudaTmaDescType::TILED, false>( + desc_k, + __cvta_generic_to_shared( + &shared->smem_k[k_barrier_id * TILE_SIZE_K + di * TILE_SIZE_K_PER_D_GROUP + + bi * tile_size_k_per_block]), + __cvta_generic_to_shared(cbw_k.barrier_ptr(k_barrier_id)), k_coords, elect_one_); + } + +#pragma unroll + for (int di = 0; di < Kernel_traits::DV_GROUPS; ++di) { + const int32_t v_coords[4] = {di * Kernel_traits::D_PER_GROUP, kv_offset_in_block, bidh_kv, + v_paged_block_offset}; + + fmha::utmaldg<4, fmha::cudaTmaDescType::TILED, false>( + desc_v, + __cvta_generic_to_shared( + &v_smem[v_barrier_id * TILE_SIZE_V + di * TILE_SIZE_V_PER_D_GROUP + + bi * tile_size_k_per_block]), + __cvta_generic_to_shared(v_barrier_ptr), v_coords, elect_one_); + } + } + + return v_barrier_id; + } + + template + // Transpose v tile explicitly as QGMMA doesn't support it. + inline __device__ void transpose_v_tile(int v_scratch_barrier_id, Shared* shared, + BufferWriter& cbw_v, + BufferReaderScratch& cbr_v_scratch) { + static_assert(NUM_THREADS_IN_DMA_GROUP == 128, ""); + Transposer transposer(threadIdx.x % NUM_THREADS_IN_DMA_GROUP); + + // Src buffer available + int ready = cbr_v_scratch.peek(); + if (!ready) { + cbr_v_scratch.wait(); + } + uint32_t smem_v_src = __cvta_generic_to_shared(&shared->smem_v_scratch[v_scratch_barrier_id]); + + // Dst buffer available + int v_barrier_id = cbw_v.threadReserve(); + uint32_t smem_v_dst = __cvta_generic_to_shared(&shared->smem_v[v_barrier_id * TILE_SIZE_V]); + +// Explicitly transpose the v buffer in smem for fp8. + +// The transposer currently has support of the following tile sizes: +// - D=32, S (or KV_STEP)=128 +// - D=64, S (or KV_STEP)=64, 128 +// - D=128, S (or KV_STEP)=64, 128 +// In addition, the transposer can only work with contiguous chunk of SMEM. +// +// For example, if V tile size is D=256 S=256, we can divide the TMA load of the V tile +// (SxD) into 2x2 chunks of size 128x128. This way, when tiles (0, 0), (0, 1) are transposed, +// either the load and the store of the data can be performed in a contiguous memory. +// +// Keep in mind in order to match GMMA requirement, we need to store the transposed tiles +// along D dim first then S dim. Leading dimension S after the transpose is at most 128B. 
+// +// Logical: +// D - D I M (contiguous) +// +// 128 128 S +// <------------> <------------> - +// s, d = (0, 0) | s, d = (0, 1) D +// ------------------------------ I +// s, d = (1, 0) | s, d = (1, 1) M +// +// In SMEM: +// D - D I M +// +// 128 128 128 128 S +// <------------> <-------------> <-------------> <------------> - +// s, d = (0, 0) | s, d = (0, 1) | s, d = (1, 0) | s, d = (1, 1) D (contiguous) +// I +// M +// +#pragma unroll + for (int kgroup_idx = 0; kgroup_idx < Kernel_traits::BMM2_K_GROUPS; kgroup_idx++) { +#pragma unroll + for (int dgroup_idx = 0; dgroup_idx < Kernel_traits::DV_GROUPS; dgroup_idx++) { + // Src smem block is k first then d + uint32_t src_offset = + (kgroup_idx * Kernel_traits::BMM2_K_PER_GROUP * Kernel_traits::D_PER_GROUP + + dgroup_idx * Kernel_traits::D_PER_GROUP * Kernel_traits::STEP_KV) * + Kernel_traits::ELEMENT_BYTES; + + // Dst smem block is d first then k + uint32_t dst_offset = + (dgroup_idx * Kernel_traits::BMM2_K_PER_GROUP * Kernel_traits::D_PER_GROUP + + kgroup_idx * Kernel_traits::BMM2_K_PER_GROUP * Kernel_traits::DV) * + Kernel_traits::ELEMENT_BYTES; + + transposer.template transpose_(smem_v_src + src_offset, smem_v_dst + dst_offset); + } + } + + fence_view_async_shared(); // Commit STSM + named_barrier_wait(SYNC_BARRIER, NUM_THREADS_IN_DMA_GROUP); // Sync before signaling + cbw_v.threadCommit(elect_one_, v_barrier_id); // Signal readiness + cbr_v_scratch.pop(elect_one_); // Advance to next phase + } + + inline __device__ void get_next_tile_id(int local_wid, int tiw, uint32_t smem_tile_id, + uint32_t* tile_id_counter_ptr) { + if constexpr (DMA_GROUP_TRANSPOSE_V) { + if (elect_one_) { + tile_id_ = atomicAdd(tile_id_counter_ptr, 1); + sts(smem_tile_id, tile_id_); + } + fence_view_async_shared(); + named_barrier_wait(SYNC_BARRIER, 128); + if (tiw == 0) { + lds(tile_id_, smem_tile_id); + } + tile_id_ = __shfl_sync(0xffffffff, tile_id_, 0); + // only one warp involved when the dma group doesn't need to transpose the v tile. + } else { + if (elect_one_) { + tile_id_ = atomicAdd(tile_id_counter_ptr, 1); + } + tile_id_ = __shfl_sync(0xffffffff, tile_id_, 0); + } + } + }; + + //////////////////////////////////////////////////////////////////////////////////////////////// + + struct Host { + Host() {} + + // Set TMA descriptors on host, and launch as __grid_constant__. + // Paged KV FMHA parameters. + void init_params(bert::Fused_multihead_attention_params_v2& params, + bert::Fused_multihead_attention_launch_params const& launch_params, + cudaStream_t stream) const { + const uint32_t d = params.d; + const uint32_t dv = params.dv; + const uint32_t h = params.h; + const uint32_t h_kv = params.h_kv; + + // Total sequence length. + const uint32_t total_seqlen = + params.is_s_padded ? (params.b * params.s) : launch_params.total_q_seqlen; + + // O Layout: [total_seqlen, H, DV] + // Per batch tensor size. + uint32_t tensor_size_o[3] = {dv, h, total_seqlen}; + + // Stride size in bytes. Assumes least significant dim is 1 + uint64_t tensor_stride_o[2] = {dv * Kernel_traits::ELEMENT_BYTES, + uint64_t(params.o_stride_in_bytes)}; + + // Starting memory address + char* o_ptr = reinterpret_cast(params.o_ptr); + + // Box size of TMA + uint32_t box_size_o[3] = {Kernel_traits::D_PER_GROUP, 1, 16}; + + // Traversal stride. + uint32_t traversal_stride[3] = {1, 1, 1}; + + // OOB fill zeros. + uint32_t oob_fill = 0; + + // FP32 to TF32 conversion disabled. + uint32_t fp32_to_tf32 = 0; + + // GMMA descriptor mode. 
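// ----------------------------------------------------------------------------------------------
// Illustrative sketch (not part of the file above): the single-warp flavour of get_next_tile_id,
// written as a standalone device helper. One elected lane (assumed to be lane 0 here) reserves
// the next unit of work with an atomic counter, and the reserved id is broadcast to the rest of
// the warp. The shared-memory round trip used when the DMA group spans 128 threads is omitted.
// ----------------------------------------------------------------------------------------------
__device__ unsigned int next_tile_id(unsigned int* tile_id_counter, bool elect_one) {
  unsigned int tile_id = 0u;
  if (elect_one) {
    tile_id = atomicAdd(tile_id_counter, 1u);  // grab the next tile index exactly once per warp
  }
  return __shfl_sync(0xffffffffu, tile_id, 0);  // broadcast from lane 0 to the whole warp
}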
+      static constexpr int D_BYTES_PER_GROUP = Kernel_traits::D_BYTES_PER_GROUP;
+      static constexpr fmha::cudaTmaDescSwizzle swizzle_mode =
+          (D_BYTES_PER_GROUP > 64   ? fmha::cudaTmaDescSwizzle::SWIZZLE_128B
+           : D_BYTES_PER_GROUP > 32 ? fmha::cudaTmaDescSwizzle::SWIZZLE_64B
+                                    : fmha::cudaTmaDescSwizzle::SWIZZLE_32B);
+
+      static_assert(STEP_KV <= 256 && STEP_Q <= 256, "max box size is 256");
+
+      // Desc Format (data type).
+      static constexpr fmha::cudaTmaDescFormat desc_format = (Kernel_traits::ELEMENT_BYTES == 1)
+                                                                 ? fmha::cudaTmaDescFormat::U8
+                                                                 : fmha::cudaTmaDescFormat::F16_RN;
+
+      fmha::Multiple_tma_descriptor<3> qo_tma_descriptor;
+
+      // TMA O
+      if (Kernel_traits::USE_TMA_STORE) {
+        qo_tma_descriptor.set_tma_desctriptor(
+            o_ptr, desc_format, fmha::cudaTmaDescInterleave::INTERLEAVE_DISABLED, swizzle_mode,
+            fmha::cudaTmaDescPromotion::PROMOTION_DISABLED, tensor_size_o, tensor_stride_o,
+            traversal_stride, box_size_o, oob_fill, fp32_to_tf32, &params.tma_desc_o);
+      }
+
+      auto const layout = launch_params.attention_input_layout;
+
+      // Q always uses a 3D tensor.
+      uint32_t tensor_size_q[3] = {d, h, total_seqlen};
+
+      uint64_t tensor_stride_q[2] = {d * Kernel_traits::ELEMENT_BYTES,
+                                     uint64_t(params.q_stride_in_bytes)};
+
+      char* q_ptr = reinterpret_cast<char*>(
+          layout == fmha::Attention_input_layout::PACKED_QKV ? params.qkv_ptr : params.q_ptr);
+
+      uint32_t box_size_q[3] = {Kernel_traits::D_PER_GROUP, 1, STEP_Q};
+
+      if (layout == fmha::Attention_input_layout::Q_PAGED_KV) {
+        // KV in q_paged_kv uses a 4D tensor.
+        // Layout: [INT32_MAX, H_KV, TokensPerBlock, D]
+        const uint32_t tokens_per_block = params.paged_kv_cache.mTokensPerBlock;
+        uint32_t tensor_size_k[4] = {d, tokens_per_block, h_kv, INT_MAX};
+        uint32_t tensor_size_v[4] = {dv, tokens_per_block, h_kv, INT_MAX};
+
+        uint64_t tensor_stride_k[3];
+        tensor_stride_k[0] = params.k_stride_in_bytes / tokens_per_block;  // d
+        tensor_stride_k[1] = params.k_stride_in_bytes;                     // d * 64
+        tensor_stride_k[2] = params.paged_kv_cache.mBytesPerBlock;
+        uint64_t tensor_stride_v[3];
+        // We cannot use dv * Kernel_traits::ELEMENT_BYTES because V may be padded (MLA).
+        tensor_stride_v[0] = params.v_stride_in_bytes / tokens_per_block;  // dv
+        tensor_stride_v[1] = params.v_stride_in_bytes;                     // dv * 64
+        tensor_stride_v[2] = params.paged_kv_cache.mBytesPerBlock;
+
+        char* kv_ptr = reinterpret_cast<char*>(params.paged_kv_cache.mPoolPtr);
+
+        uint32_t box_size_kv[4] = {Kernel_traits::D_PER_GROUP,
+                                   std::min(tokens_per_block, STEP_KV), 1, 1};
+
+        assert(STEP_KV % tokens_per_block == 0 || tokens_per_block % STEP_KV == 0);
+        params.blocks_per_tma_load = std::max(1, STEP_KV / tokens_per_block);
+        params.blocks_per_tma_load_log2 = log2(params.blocks_per_tma_load);
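+
+        // Illustrative cases (values assumed for the example): tokens_per_block = 64 with
+        // STEP_KV = 128 gives blocks_per_tma_load = 2, i.e. two paged blocks are read per KV
+        // tile; tokens_per_block = 128 with STEP_KV = 64 gives blocks_per_tma_load = 1 and the
+        // box of std::min(tokens_per_block, STEP_KV) = 64 tokens reads only part of one block
+        // per step. The assert above guarantees one of the two divisibility cases holds.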
+
+        uint32_t traversal_stride[4] = {1, 1, 1, 1};
+
+        fmha::Multiple_tma_descriptor<4> kv_tma_descriptor;
+        // K
+        kv_tma_descriptor.set_tma_desctriptor(
+            kv_ptr, desc_format, fmha::cudaTmaDescInterleave::INTERLEAVE_DISABLED, swizzle_mode,
+            fmha::cudaTmaDescPromotion::PROMOTION_DISABLED, tensor_size_k, tensor_stride_k,
+            traversal_stride, box_size_kv, oob_fill, fp32_to_tf32, &params.tma_desc_k);
+        // V
+        kv_tma_descriptor.set_tma_desctriptor(
+            kv_ptr, desc_format, fmha::cudaTmaDescInterleave::INTERLEAVE_DISABLED, swizzle_mode,
+            fmha::cudaTmaDescPromotion::PROMOTION_DISABLED, tensor_size_v, tensor_stride_v,
+            traversal_stride, box_size_kv, oob_fill, fp32_to_tf32, &params.tma_desc_v);
+      } else {
+        // Otherwise KV uses a 3D tensor.
+        uint32_t tensor_size_k[3] = {d, h_kv, total_seqlen};
+        uint32_t tensor_size_v[3] = {dv, h_kv, total_seqlen};
+
+        uint64_t tensor_stride_k[2] = {d * Kernel_traits::ELEMENT_BYTES,
+                                       uint64_t(params.k_stride_in_bytes)};
+        uint64_t tensor_stride_v[2] = {dv * Kernel_traits::ELEMENT_BYTES,
+                                       uint64_t(params.v_stride_in_bytes)};
+
+        uint32_t box_size_kv[3] = {Kernel_traits::D_PER_GROUP, 1, STEP_KV};
+
+        char *k_ptr, *v_ptr;
+
+        if (layout == fmha::Attention_input_layout::PACKED_QKV) {
+          if (!HEADS_INTERLEAVED || h != h_kv) {
+            // Layout: [total_seqlen, (H, D) + (H_KV, D) + (H_KV, DV)]
+            // All of MHA in TRTLLM is in this layout,
+            // and MQA/GQA must use this layout.
+            k_ptr = q_ptr + h * d * Kernel_traits::ELEMENT_BYTES;
+            v_ptr = k_ptr + h_kv * d * Kernel_traits::ELEMENT_BYTES;
+          } else {
+            // Layout: [total_seqlen, H, D + D + DV]
+            // Currently only used in MHA in fmha_v2 tests.
+            tensor_stride_q[0] = tensor_stride_k[0] = tensor_stride_v[0] =
+                (2 * d + dv) * Kernel_traits::ELEMENT_BYTES;
+            k_ptr = q_ptr + d * Kernel_traits::ELEMENT_BYTES;
+            v_ptr = k_ptr + d * Kernel_traits::ELEMENT_BYTES;
+          }
+        } else if (layout == fmha::Attention_input_layout::CONTIGUOUS_Q_KV) {
+          k_ptr = reinterpret_cast<char*>(params.kv_ptr);
+          v_ptr = k_ptr + h_kv * d * Kernel_traits::ELEMENT_BYTES;
+        } else if (layout == fmha::Attention_input_layout::SEPARATE_Q_K_V) {
+          k_ptr = reinterpret_cast<char*>(params.k_ptr);
+          v_ptr = reinterpret_cast<char*>(params.v_ptr);
+        }
+
+        fmha::Multiple_tma_descriptor<3> kv_tma_descriptor;
+        // K
+        kv_tma_descriptor.set_tma_desctriptor(
+            k_ptr, desc_format, fmha::cudaTmaDescInterleave::INTERLEAVE_DISABLED, swizzle_mode,
+            fmha::cudaTmaDescPromotion::PROMOTION_DISABLED, tensor_size_k, tensor_stride_k,
+            traversal_stride, box_size_kv, oob_fill, fp32_to_tf32, &params.tma_desc_k);
+        // V
+        kv_tma_descriptor.set_tma_desctriptor(
+            v_ptr, desc_format, fmha::cudaTmaDescInterleave::INTERLEAVE_DISABLED, swizzle_mode,
+            fmha::cudaTmaDescPromotion::PROMOTION_DISABLED, tensor_size_v, tensor_stride_v,
+            traversal_stride, box_size_kv, oob_fill, fp32_to_tf32, &params.tma_desc_v);
+      }
+      // Q
+      qo_tma_descriptor.set_tma_desctriptor(
+          q_ptr, desc_format, fmha::cudaTmaDescInterleave::INTERLEAVE_DISABLED, swizzle_mode,
+          fmha::cudaTmaDescPromotion::PROMOTION_DISABLED, tensor_size_q, tensor_stride_q,
+          traversal_stride, box_size_q, oob_fill, fp32_to_tf32, &params.tma_desc_q);
+    }
+  };
+};
+
+}  // namespace ws
+
+}  // namespace fmha
diff --git a/csrc/fmha_v2/fmha/warpspec/epilogue.h b/csrc/fmha_v2/fmha/warpspec/epilogue.h
new file mode 100644
index 0000000000..15f8636207
--- /dev/null
+++ b/csrc/fmha_v2/fmha/warpspec/epilogue.h
@@ -0,0 +1,1091 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2011-2024 NVIDIA CORPORATION & AFFILIATES. All rights
+ * reserved. SPDX-License-Identifier: NVIDIA TensorRT Source Code License Agreement
+ *
+ * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+ * property and proprietary rights in and to this material, related
+ * documentation and any modifications thereto. Any use, reproduction,
+ * disclosure or distribution of this material and related documentation
+ * without an express license agreement from NVIDIA CORPORATION or
+ * its affiliates is strictly prohibited.
+ */
+
+#pragma once
+
+#include
+#include
+#include
+
+namespace fmha {
+namespace ws {
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Special Softmax struct to handle optimization tricks on Hopper Warp-Specialized Kernels.
+template